commit 1e9439e9a8e7213054372eff71554995929203ce Author: Jiayi Yin Date: Sun Mar 16 16:22:52 2025 +0000 init diff --git a/0001-Backport-LoongArch-Add-relax-feature-and-keep-relocations.patch b/0001-Backport-LoongArch-Add-relax-feature-and-keep-relocations.patch new file mode 100644 index 0000000..9602f6c --- /dev/null +++ b/0001-Backport-LoongArch-Add-relax-feature-and-keep-relocations.patch @@ -0,0 +1,178 @@ +From 6f135b13769c64a6942b4b232a350b6a6207f2b2 Mon Sep 17 00:00:00 2001 +From: Jinyang He +Date: Thu, 16 Nov 2023 11:01:26 +0800 +Subject: [PATCH 02/14] [LoongArch] Add relax feature and keep relocations + (#72191) + +Add relax feature. To support linker relocation, we should make +relocation with a symbol rather than section plus offset, and keep all +relocations with non-abs symbol. + +(cherry picked from commit f5bfc833fcbf17a5876911783d1adaca7028d20c) +Change-Id: Ief38b480016175f2cc9939b74a84d9444559ffd6 +--- + llvm/lib/Target/LoongArch/LoongArch.td | 4 +++ + .../lib/Target/LoongArch/LoongArchSubtarget.h | 2 ++ + .../MCTargetDesc/LoongArchAsmBackend.cpp | 5 +-- + .../MCTargetDesc/LoongArchELFObjectWriter.cpp | 18 ++++++++--- + .../MCTargetDesc/LoongArchMCTargetDesc.h | 2 +- + .../MC/LoongArch/Relocations/relax-attr.s | 32 +++++++++++++++++++ + 6 files changed, 55 insertions(+), 8 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Relocations/relax-attr.s + +diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td +index 0675caa3b601..75b65fe69f26 100644 +--- a/llvm/lib/Target/LoongArch/LoongArch.td ++++ b/llvm/lib/Target/LoongArch/LoongArch.td +@@ -102,6 +102,10 @@ def FeatureUAL + : SubtargetFeature<"ual", "HasUAL", "true", + "Allow memory accesses to be unaligned">; + ++def FeatureRelax ++ : SubtargetFeature<"relax", "HasLinkerRelax", "true", ++ "Enable Linker relaxation">; ++ + //===----------------------------------------------------------------------===// + // Registers, instruction descriptions ... 
+ //===----------------------------------------------------------------------===// +diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +index 0fbe23f2f62d..5c173675cca4 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h ++++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +@@ -43,6 +43,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { + bool HasLaGlobalWithAbs = false; + bool HasLaLocalWithAbs = false; + bool HasUAL = false; ++ bool HasLinkerRelax = false; + unsigned GRLen = 32; + MVT GRLenVT = MVT::i32; + LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; +@@ -100,6 +101,7 @@ public: + bool hasLaGlobalWithAbs() const { return HasLaGlobalWithAbs; } + bool hasLaLocalWithAbs() const { return HasLaLocalWithAbs; } + bool hasUAL() const { return HasUAL; } ++ bool hasLinkerRelax() const { return HasLinkerRelax; } + MVT getGRLenVT() const { return GRLenVT; } + unsigned getGRLen() const { return GRLen; } + LoongArchABI::ABI getTargetABI() const { return TargetABI; } +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +index ecb68ff401e9..aae3e544d326 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +@@ -168,7 +168,7 @@ bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + return true; + switch (Fixup.getTargetKind()) { + default: +- return false; ++ return STI.hasFeature(LoongArch::FeatureRelax); + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: +@@ -193,7 +193,8 @@ bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, + + std::unique_ptr + LoongArchAsmBackend::createObjectTargetWriter() const { +- return createLoongArchELFObjectWriter(OSABI, Is64Bit); ++ return createLoongArchELFObjectWriter( ++ OSABI, Is64Bit, STI.hasFeature(LoongArch::FeatureRelax)); + } 
+ + MCAsmBackend *llvm::createLoongArchAsmBackend(const Target &T, +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +index a6b9c0652639..e60b9c2cfd97 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +@@ -20,19 +20,27 @@ using namespace llvm; + namespace { + class LoongArchELFObjectWriter : public MCELFObjectTargetWriter { + public: +- LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); ++ LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool EnableRelax); + + ~LoongArchELFObjectWriter() override; + ++ bool needsRelocateWithSymbol(const MCSymbol &Sym, ++ unsigned Type) const override { ++ return EnableRelax; ++ } ++ + protected: + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; ++ bool EnableRelax; + }; + } // end namespace + +-LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) ++LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, ++ bool EnableRelax) + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, +- /*HasRelocationAddend*/ true) {} ++ /*HasRelocationAddend=*/true), ++ EnableRelax(EnableRelax) {} + + LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} + +@@ -87,6 +95,6 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, + } + + std::unique_ptr +-llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) { +- return std::make_unique(OSABI, Is64Bit); ++llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax) { ++ return std::make_unique(OSABI, Is64Bit, Relax); + } +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +index ab35a0096c8a..bb05baa9b717 100644 +--- 
a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +@@ -36,7 +36,7 @@ MCAsmBackend *createLoongArchAsmBackend(const Target &T, + const MCTargetOptions &Options); + + std::unique_ptr +-createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); ++createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax); + + } // end namespace llvm + +diff --git a/llvm/test/MC/LoongArch/Relocations/relax-attr.s b/llvm/test/MC/LoongArch/Relocations/relax-attr.s +new file mode 100644 +index 000000000000..b1e648d850bb +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Relocations/relax-attr.s +@@ -0,0 +1,32 @@ ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t ++# RUN: llvm-readobj -r %t | FileCheck %s ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s -o %t ++# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=CHECKR ++ ++# CHECK: Relocations [ ++# CHECK-NEXT: Section ({{.*}}) .rela.data { ++# CHECK-NEXT: 0x0 R_LARCH_64 .text 0x4 ++# CHECK-NEXT: } ++# CHECK-NEXT: ] ++ ++# CHECKR: Relocations [ ++# CHECKR-NEXT: Section ({{.*}}) .rela.text { ++# CHECKR-NEXT: 0x8 R_LARCH_B21 .L1 0x0 ++# CHECKR-NEXT: 0xC R_LARCH_B16 .L1 0x0 ++# CHECKR-NEXT: 0x10 R_LARCH_B26 .L1 0x0 ++# CHECKR-NEXT: } ++# CHECKR-NEXT: Section ({{.*}}) .rela.data { ++# CHECKR-NEXT: 0x0 R_LARCH_64 .L1 0x0 ++# CHECKR-NEXT: } ++# CHECKR-NEXT: ] ++ ++.text ++ nop ++.L1: ++ nop ++ beqz $a0, .L1 ++ blt $a0, $a1, .L1 ++ b .L1 ++ ++.data ++.dword .L1 +-- +2.20.1 + diff --git a/0002-Backport-LoongArch-Allow-delayed-decision-for-ADD-SUB-relocations.patch b/0002-Backport-LoongArch-Allow-delayed-decision-for-ADD-SUB-relocations.patch new file mode 100644 index 0000000..496e268 --- /dev/null +++ b/0002-Backport-LoongArch-Allow-delayed-decision-for-ADD-SUB-relocations.patch @@ -0,0 +1,299 @@ +From 77d74b8fa071fa2695c9782e2e63e7b930895b1b Mon Sep 17 00:00:00 2001 +From: Jinyang He +Date: Wed, 20 Dec 2023 10:54:51 
+0800 +Subject: [PATCH 03/14] [LoongArch] Allow delayed decision for ADD/SUB + relocations (#72960) + +Refer to RISCV [1], LoongArch also need delayed decision for ADD/SUB +relocations. In handleAddSubRelocations, just return directly if SecA != +SecB, handleFixup usually will finish the the rest of creating PCRel +relocations works. Otherwise we emit relocs depends on whether +relaxation is enabled. If not, we return true and avoid record ADD/SUB +relocations. +Now the two symbols separated by alignment directive will return without +folding symbol offset in AttemptToFoldSymbolOffsetDifference, which has +the same effect when relaxation is enabled. + +[1] https://reviews.llvm.org/D155357 + +(cherry picked from commit a8081ed8ff0fd11fb8d5f4c83df49da909e49612) +Change-Id: Ic4c6a3eb11b576cb0c6ed0eba02150ad67c33cf2 +--- + llvm/lib/MC/MCExpr.cpp | 3 +- + .../MCTargetDesc/LoongArchAsmBackend.cpp | 78 +++++++++++++++++++ + .../MCTargetDesc/LoongArchAsmBackend.h | 9 ++- + .../MCTargetDesc/LoongArchFixupKinds.h | 4 +- + llvm/test/MC/LoongArch/Misc/subsection.s | 38 +++++++++ + .../MC/LoongArch/Relocations/relax-addsub.s | 68 ++++++++++++++++ + 6 files changed, 196 insertions(+), 4 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Misc/subsection.s + create mode 100644 llvm/test/MC/LoongArch/Relocations/relax-addsub.s + +diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp +index a7b980553af0..5a6596f93824 100644 +--- a/llvm/lib/MC/MCExpr.cpp ++++ b/llvm/lib/MC/MCExpr.cpp +@@ -635,7 +635,8 @@ static void AttemptToFoldSymbolOffsetDifference( + // instructions and InSet is false (not expressions in directive like + // .size/.fill), disable the fast path. + if (Layout && (InSet || !SecA.hasInstructions() || +- !Asm->getContext().getTargetTriple().isRISCV())) { ++ !(Asm->getContext().getTargetTriple().isRISCV() || ++ Asm->getContext().getTargetTriple().isLoongArch()))) { + // If both symbols are in the same fragment, return the difference of their + // offsets. 
canGetFragmentOffset(FA) may be false. + if (FA == FB && !SA.isVariable() && !SB.isVariable()) { +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +index aae3e544d326..1ed047a8e632 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +@@ -177,6 +177,34 @@ bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + } + } + ++static inline std::pair ++getRelocPairForSize(unsigned Size) { ++ switch (Size) { ++ default: ++ llvm_unreachable("unsupported fixup size"); ++ case 6: ++ return std::make_pair( ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD6), ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB6)); ++ case 8: ++ return std::make_pair( ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD8), ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB8)); ++ case 16: ++ return std::make_pair( ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD16), ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB16)); ++ case 32: ++ return std::make_pair( ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD32), ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB32)); ++ case 64: ++ return std::make_pair( ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD64), ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB64)); ++ } ++} ++ + bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const { + // We mostly follow binutils' convention here: align to 4-byte boundary with a +@@ -191,6 +219,56 @@ bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, + return true; + } + ++bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout, ++ const MCFragment &F, ++ const MCFixup &Fixup, ++ const MCValue &Target, ++ uint64_t 
&FixedValue) const { ++ std::pair FK; ++ uint64_t FixedValueA, FixedValueB; ++ const MCSection &SecA = Target.getSymA()->getSymbol().getSection(); ++ const MCSection &SecB = Target.getSymB()->getSymbol().getSection(); ++ ++ // We need record relocation if SecA != SecB. Usually SecB is same as the ++ // section of Fixup, which will be record the relocation as PCRel. If SecB ++ // is not same as the section of Fixup, it will report error. Just return ++ // false and then this work can be finished by handleFixup. ++ if (&SecA != &SecB) ++ return false; ++ ++ // In SecA == SecB case. If the linker relaxation is enabled, we need record ++ // the ADD, SUB relocations. Otherwise the FixedValue has already been ++ // calculated out in evaluateFixup, return true and avoid record relocations. ++ if (!STI.hasFeature(LoongArch::FeatureRelax)) ++ return true; ++ ++ switch (Fixup.getKind()) { ++ case llvm::FK_Data_1: ++ FK = getRelocPairForSize(8); ++ break; ++ case llvm::FK_Data_2: ++ FK = getRelocPairForSize(16); ++ break; ++ case llvm::FK_Data_4: ++ FK = getRelocPairForSize(32); ++ break; ++ case llvm::FK_Data_8: ++ FK = getRelocPairForSize(64); ++ break; ++ default: ++ llvm_unreachable("unsupported fixup size"); ++ } ++ MCValue A = MCValue::get(Target.getSymA(), nullptr, Target.getConstant()); ++ MCValue B = MCValue::get(Target.getSymB()); ++ auto FA = MCFixup::create(Fixup.getOffset(), nullptr, std::get<0>(FK)); ++ auto FB = MCFixup::create(Fixup.getOffset(), nullptr, std::get<1>(FK)); ++ auto &Asm = Layout.getAssembler(); ++ Asm.getWriter().recordRelocation(Asm, Layout, &F, FA, A, FixedValueA); ++ Asm.getWriter().recordRelocation(Asm, Layout, &F, FB, B, FixedValueB); ++ FixedValue = FixedValueA - FixedValueB; ++ return true; ++} ++ + std::unique_ptr + LoongArchAsmBackend::createObjectTargetWriter() const { + return createLoongArchELFObjectWriter( +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h 
b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +index ae9bb8af0419..20f25b5cf53b 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +@@ -31,10 +31,15 @@ class LoongArchAsmBackend : public MCAsmBackend { + public: + LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, + const MCTargetOptions &Options) +- : MCAsmBackend(support::little), STI(STI), OSABI(OSABI), Is64Bit(Is64Bit), +- TargetOptions(Options) {} ++ : MCAsmBackend(support::little, ++ LoongArch::fixup_loongarch_relax), ++ STI(STI), OSABI(OSABI), Is64Bit(Is64Bit), TargetOptions(Options) {} + ~LoongArchAsmBackend() override {} + ++ bool handleAddSubRelocations(const MCAsmLayout &Layout, const MCFragment &F, ++ const MCFixup &Fixup, const MCValue &Target, ++ uint64_t &FixedValue) const override; ++ + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target, MutableArrayRef Data, + uint64_t Value, bool IsResolved, +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +index ba2d6718cdf9..178fa6e5262b 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +@@ -106,7 +106,9 @@ enum Fixups { + // 20-bit fixup corresponding to %gd_pc_hi20(foo) for instruction pcalau12i. + fixup_loongarch_tls_gd_pc_hi20, + // 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w. +- fixup_loongarch_tls_gd_hi20 ++ fixup_loongarch_tls_gd_hi20, ++ // Generate an R_LARCH_RELAX which indicates the linker may relax here. 
++ fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX + }; + } // end namespace LoongArch + } // end namespace llvm +diff --git a/llvm/test/MC/LoongArch/Misc/subsection.s b/llvm/test/MC/LoongArch/Misc/subsection.s +new file mode 100644 +index 000000000000..0bd22b474536 +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Misc/subsection.s +@@ -0,0 +1,38 @@ ++# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,NORELAX --implicit-check-not=error: ++## TODO: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error: ++ ++a: ++ nop ++b: ++ la.pcrel $t0, a ++c: ++ nop ++d: ++ ++.data ++## Positive subsection numbers ++## With relaxation, report an error as c-b is not an assemble-time constant. ++# RELAX: :[[#@LINE+1]]:14: error: cannot evaluate subsection number ++.subsection c-b ++# RELAX: :[[#@LINE+1]]:14: error: cannot evaluate subsection number ++.subsection d-b ++# RELAX: :[[#@LINE+1]]:14: error: cannot evaluate subsection number ++.subsection c-a ++ ++.subsection b-a ++.subsection d-c ++ ++## Negative subsection numbers ++# NORELAX: :[[#@LINE+2]]:14: error: subsection number -8 is not within [0,2147483647] ++# RELAX: :[[#@LINE+1]]:14: error: cannot evaluate subsection number ++.subsection b-c ++# NORELAX: :[[#@LINE+2]]:14: error: subsection number -12 is not within [0,2147483647] ++# RELAX: :[[#@LINE+1]]:14: error: cannot evaluate subsection number ++.subsection b-d ++# NORELAX: :[[#@LINE+2]]:14: error: subsection number -12 is not within [0,2147483647] ++# RELAX: :[[#@LINE+1]]:14: error: cannot evaluate subsection number ++.subsection a-c ++# ERR: :[[#@LINE+1]]:14: error: subsection number -4 is not within [0,2147483647] ++.subsection a-b ++# ERR: :[[#@LINE+1]]:14: error: subsection number -4 is not within [0,2147483647] ++.subsection c-d +diff --git 
a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +new file mode 100644 +index 000000000000..532eb4e0561a +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +@@ -0,0 +1,68 @@ ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \ ++# RUN: | llvm-readobj -r -x .data - | FileCheck %s --check-prefix=NORELAX ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \ ++# RUN: | llvm-readobj -r -x .data - | FileCheck %s --check-prefix=RELAX ++ ++# NORELAX: Relocations [ ++# NORELAX-NEXT: Section ({{.*}}) .rela.text { ++# NORELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .text 0x0 ++# NORELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .text 0x0 ++# NORELAX-NEXT: } ++# NORELAX-NEXT: ] ++ ++# NORELAX: Hex dump of section '.data': ++# NORELAX-NEXT: 0x00000000 04040004 00000004 00000000 0000000c ++# NORELAX-NEXT: 0x00000010 0c000c00 00000c00 00000000 00000808 ++# NORELAX-NEXT: 0x00000020 00080000 00080000 00000000 00 ++ ++# RELAX: Relocations [ ++# RELAX-NEXT: Section ({{.*}}) .rela.text { ++# RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 ++# RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 ++# RELAX-NEXT: } ++# RELAX-NEXT: Section ({{.*}}) .rela.data { ++# RELAX-NEXT: 0xF R_LARCH_ADD8 .L3 0x0 ++# RELAX-NEXT: 0xF R_LARCH_SUB8 .L2 0x0 ++# RELAX-NEXT: 0x10 R_LARCH_ADD16 .L3 0x0 ++# RELAX-NEXT: 0x10 R_LARCH_SUB16 .L2 0x0 ++# RELAX-NEXT: 0x12 R_LARCH_ADD32 .L3 0x0 ++# RELAX-NEXT: 0x12 R_LARCH_SUB32 .L2 0x0 ++# RELAX-NEXT: 0x16 R_LARCH_ADD64 .L3 0x0 ++# RELAX-NEXT: 0x16 R_LARCH_SUB64 .L2 0x0 ++# RELAX-NEXT: } ++# RELAX-NEXT: ] ++ ++# RELAX: Hex dump of section '.data': ++# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000000 ++# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000808 ++# RELAX-NEXT: 0x00000020 00080000 00080000 00000000 00 ++ ++.text ++.L1: ++ nop ++.L2: ++ .align 4 ++.L3: ++ la.pcrel $t0, .L1 ++.L4: ++ ret ++ ++.data ++## Not emit relocs ++.byte .L2 - .L1 ++.short .L2 - .L1 
++.word .L2 - .L1 ++.dword .L2 - .L1 ++## With relaxation, emit relocs because of the .align making the diff variable. ++## TODO Handle alignment directive. Why they emit relocs now? They returns ++## without folding symbols offset in AttemptToFoldSymbolOffsetDifference(). ++.byte .L3 - .L2 ++.short .L3 - .L2 ++.word .L3 - .L2 ++.dword .L3 - .L2 ++## TODO ++## With relaxation, emit relocs because la.pcrel is a linker-relaxable inst. ++.byte .L4 - .L3 ++.short .L4 - .L3 ++.word .L4 - .L3 ++.dword .L4 - .L3 +-- +2.20.1 + diff --git a/0003-Backport-LoongArch-Emit-R_LARCH_RELAX-when-expanding-some-LoadAddress.patch b/0003-Backport-LoongArch-Emit-R_LARCH_RELAX-when-expanding-some-LoadAddress.patch new file mode 100644 index 0000000..93a2174 --- /dev/null +++ b/0003-Backport-LoongArch-Emit-R_LARCH_RELAX-when-expanding-some-LoadAddress.patch @@ -0,0 +1,364 @@ +From f2495d7efb79fdc82af6147f7201d9cf3c91beba Mon Sep 17 00:00:00 2001 +From: Jinyang He +Date: Wed, 27 Dec 2023 08:51:48 +0800 +Subject: [PATCH 04/14] [LoongArch] Emit R_LARCH_RELAX when expanding some + LoadAddress (#72961) + +Emit relax relocs when expand non-large la.pcrel and non-large la.got on +llvm-mc stage, which like what does on GAS. 
+1, la.pcrel -> PCALA_HI20 + RELAX + PCALA_LO12 + RELAX +2, la.got -> GOT_PC_HI20 + RELAX + GOT_PC_LO12 + RELAX + +(cherry picked from commit b3ef8dce9811b2725639b0d4fac3f85c7e112817) +Change-Id: I222daf60b36ee70e23c76b753e1d2a3b8148f44b +--- + .../AsmParser/LoongArchAsmParser.cpp | 12 +-- + .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 13 +++ + .../MCTargetDesc/LoongArchMCExpr.cpp | 7 +- + .../LoongArch/MCTargetDesc/LoongArchMCExpr.h | 8 +- + llvm/test/MC/LoongArch/Macros/macros-la.s | 84 ++++++++++++++++--- + llvm/test/MC/LoongArch/Misc/subsection.s | 2 +- + .../MC/LoongArch/Relocations/relax-addsub.s | 16 +++- + 7 files changed, 115 insertions(+), 27 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +index 94d530306536..a132e645c864 100644 +--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp ++++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +@@ -86,7 +86,7 @@ class LoongArchAsmParser : public MCTargetAsmParser { + // "emitLoadAddress*" functions. + void emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg, + const MCExpr *Symbol, SmallVectorImpl &Insts, +- SMLoc IDLoc, MCStreamer &Out); ++ SMLoc IDLoc, MCStreamer &Out, bool RelaxHint = false); + + // Helper to emit pseudo instruction "la.abs $rd, sym". 
+ void emitLoadAddressAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); +@@ -749,12 +749,14 @@ bool LoongArchAsmParser::ParseInstruction(ParseInstructionInfo &Info, + void LoongArchAsmParser::emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg, + const MCExpr *Symbol, + SmallVectorImpl &Insts, +- SMLoc IDLoc, MCStreamer &Out) { ++ SMLoc IDLoc, MCStreamer &Out, ++ bool RelaxHint) { + MCContext &Ctx = getContext(); + for (LoongArchAsmParser::Inst &Inst : Insts) { + unsigned Opc = Inst.Opc; + LoongArchMCExpr::VariantKind VK = Inst.VK; +- const LoongArchMCExpr *LE = LoongArchMCExpr::create(Symbol, VK, Ctx); ++ const LoongArchMCExpr *LE = ++ LoongArchMCExpr::create(Symbol, VK, Ctx, RelaxHint); + switch (Opc) { + default: + llvm_unreachable("unexpected opcode"); +@@ -855,7 +857,7 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, + Insts.push_back( + LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); + +- emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); ++ emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + } + + void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, +@@ -901,7 +903,7 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, + Insts.push_back( + LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); + +- emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); ++ emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + } + + void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +index 03fb9e008ae9..08c0820cb862 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +@@ -19,6 +19,7 @@ + #include "llvm/MC/MCInstBuilder.h" + #include 
"llvm/MC/MCInstrInfo.h" + #include "llvm/MC/MCRegisterInfo.h" ++#include "llvm/MC/MCSubtargetInfo.h" + #include "llvm/Support/Casting.h" + #include "llvm/Support/EndianStream.h" + +@@ -120,12 +121,15 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + assert(MO.isExpr() && "getExprOpValue expects only expressions"); ++ bool RelaxCandidate = false; ++ bool EnableRelax = STI.hasFeature(LoongArch::FeatureRelax); + const MCExpr *Expr = MO.getExpr(); + MCExpr::ExprKind Kind = Expr->getKind(); + LoongArch::Fixups FixupKind = LoongArch::fixup_loongarch_invalid; + if (Kind == MCExpr::Target) { + const LoongArchMCExpr *LAExpr = cast(Expr); + ++ RelaxCandidate = LAExpr->getRelaxHint(); + switch (LAExpr->getKind()) { + case LoongArchMCExpr::VK_LoongArch_None: + case LoongArchMCExpr::VK_LoongArch_Invalid: +@@ -269,6 +273,15 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, + + Fixups.push_back( + MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc())); ++ ++ // Emit an R_LARCH_RELAX if linker relaxation is enabled and LAExpr has relax ++ // hint. 
++ if (EnableRelax && RelaxCandidate) { ++ const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); ++ Fixups.push_back(MCFixup::create( ++ 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); ++ } ++ + return 0; + } + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +index 993111552a31..82c992b1cc8c 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +@@ -25,9 +25,10 @@ using namespace llvm; + + #define DEBUG_TYPE "loongarch-mcexpr" + +-const LoongArchMCExpr * +-LoongArchMCExpr::create(const MCExpr *Expr, VariantKind Kind, MCContext &Ctx) { +- return new (Ctx) LoongArchMCExpr(Expr, Kind); ++const LoongArchMCExpr *LoongArchMCExpr::create(const MCExpr *Expr, ++ VariantKind Kind, MCContext &Ctx, ++ bool Hint) { ++ return new (Ctx) LoongArchMCExpr(Expr, Kind, Hint); + } + + void LoongArchMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +index 0945cf82db86..93251f824103 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +@@ -67,16 +67,18 @@ public: + private: + const MCExpr *Expr; + const VariantKind Kind; ++ const bool RelaxHint; + +- explicit LoongArchMCExpr(const MCExpr *Expr, VariantKind Kind) +- : Expr(Expr), Kind(Kind) {} ++ explicit LoongArchMCExpr(const MCExpr *Expr, VariantKind Kind, bool Hint) ++ : Expr(Expr), Kind(Kind), RelaxHint(Hint) {} + + public: + static const LoongArchMCExpr *create(const MCExpr *Expr, VariantKind Kind, +- MCContext &Ctx); ++ MCContext &Ctx, bool Hint = false); + + VariantKind getKind() const { return Kind; } + const MCExpr *getSubExpr() const { return Expr; } ++ bool getRelaxHint() const { return RelaxHint; } + + 
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, +diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s +index 924e4326b8e5..1a1d12d7d7df 100644 +--- a/llvm/test/MC/LoongArch/Macros/macros-la.s ++++ b/llvm/test/MC/LoongArch/Macros/macros-la.s +@@ -1,66 +1,128 @@ + # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t ++# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax ++# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX ++ ++# RELOC: Relocations [ ++# RELOC-NEXT: Section ({{.*}}) .rela.text { + + la.abs $a0, sym_abs + # CHECK: lu12i.w $a0, %abs_hi20(sym_abs) + # CHECK-NEXT: ori $a0, $a0, %abs_lo12(sym_abs) + # CHECK-NEXT: lu32i.d $a0, %abs64_lo20(sym_abs) + # CHECK-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_abs) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_ABS_HI20 sym_abs 0x0 ++# RELOC-NEXT: R_LARCH_ABS_LO12 sym_abs 0x0 ++# RELOC-NEXT: R_LARCH_ABS64_LO20 sym_abs 0x0 ++# RELOC-NEXT: R_LARCH_ABS64_HI12 sym_abs 0x0 + + la.pcrel $a0, sym_pcrel +-# CHECK: pcalau12i $a0, %pc_hi20(sym_pcrel) ++# CHECK-NEXT: pcalau12i $a0, %pc_hi20(sym_pcrel) + # CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(sym_pcrel) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_pcrel 0x0 ++# RELAX-NEXT: R_LARCH_RELAX - 0x0 ++# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 ++# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + la.pcrel $a0, $a1, sym_pcrel_large +-# CHECK: pcalau12i $a0, %pc_hi20(sym_pcrel_large) ++# CHECK-NEXT: pcalau12i $a0, %pc_hi20(sym_pcrel_large) + # CHECK-NEXT: addi.d $a1, $zero, %pc_lo12(sym_pcrel_large) + # CHECK-NEXT: lu32i.d $a1, %pc64_lo20(sym_pcrel_large) + # CHECK-NEXT: lu52i.d $a1, $a1, %pc64_hi12(sym_pcrel_large) + # CHECK-NEXT: add.d $a0, $a0, $a1 ++# 
CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_pcrel_large 0x0 ++# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel_large 0x0 ++# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_pcrel_large 0x0 ++# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_pcrel_large 0x0 + + la.got $a0, sym_got +-# CHECK: pcalau12i $a0, %got_pc_hi20(sym_got) ++# CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) + # CHECK-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_got) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_got 0x0 ++# RELAX-NEXT: R_LARCH_RELAX - 0x0 ++# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_got 0x0 ++# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + la.got $a0, $a1, sym_got_large +-# CHECK: pcalau12i $a0, %got_pc_hi20(sym_got_large) ++# CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got_large) + # CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_got_large) + # CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_got_large) + # CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_got_large) + # CHECK-NEXT: ldx.d $a0, $a0, $a1 ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_got_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_got_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_got_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_got_large 0x0 + + la.tls.le $a0, sym_le +-# CHECK: lu12i.w $a0, %le_hi20(sym_le) ++# CHECK-NEXT: lu12i.w $a0, %le_hi20(sym_le) + # CHECK-NEXT: ori $a0, $a0, %le_lo12(sym_le) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_LE_HI20 sym_le 0x0 ++# RELOC-NEXT: R_LARCH_TLS_LE_LO12 sym_le 0x0 + + la.tls.ie $a0, sym_ie +-# CHECK: pcalau12i $a0, %ie_pc_hi20(sym_ie) ++# CHECK-NEXT: pcalau12i $a0, %ie_pc_hi20(sym_ie) + # CHECK-NEXT: ld.d $a0, $a0, %ie_pc_lo12(sym_ie) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 ++# RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 + + la.tls.ie $a0, $a1, sym_ie_large +-# CHECK: pcalau12i $a0, %ie_pc_hi20(sym_ie_large) ++# CHECK-NEXT: pcalau12i $a0, %ie_pc_hi20(sym_ie_large) + # CHECK-NEXT: addi.d $a1, $zero, %ie_pc_lo12(sym_ie_large) + # CHECK-NEXT: lu32i.d 
$a1, %ie64_pc_lo20(sym_ie_large) + # CHECK-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(sym_ie_large) + # CHECK-NEXT: ldx.d $a0, $a0, $a1 ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie_large 0x0 ++# RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie_large 0x0 ++# RELOC-NEXT: R_LARCH_TLS_IE64_PC_LO20 sym_ie_large 0x0 ++# RELOC-NEXT: R_LARCH_TLS_IE64_PC_HI12 sym_ie_large 0x0 + + la.tls.ld $a0, sym_ld +-# CHECK: pcalau12i $a0, %ld_pc_hi20(sym_ld) ++# CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) + # CHECK-NEXT: addi.d $a0, $a0, %got_pc_lo12(sym_ld) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 ++# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 + + la.tls.ld $a0, $a1, sym_ld_large +-# CHECK: pcalau12i $a0, %ld_pc_hi20(sym_ld_large) ++# CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld_large) + # CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_ld_large) + # CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_ld_large) + # CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_ld_large) + # CHECK-NEXT: add.d $a0, $a0, $a1 ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_ld_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_ld_large 0x0 + + la.tls.gd $a0, sym_gd +-# CHECK: pcalau12i $a0, %gd_pc_hi20(sym_gd) ++# CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) + # CHECK-NEXT: addi.d $a0, $a0, %got_pc_lo12(sym_gd) ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 ++# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 + + la.tls.gd $a0, $a1, sym_gd_large +-# CHECK: pcalau12i $a0, %gd_pc_hi20(sym_gd_large) ++# CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd_large) + # CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_gd_large) + # CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_gd_large) + # CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_gd_large) + # CHECK-NEXT: add.d $a0, $a0, $a1 ++# CHECK-EMPTY: ++# RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd_large 0x0 
++# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_gd_large 0x0 ++# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_gd_large 0x0 ++ ++# RELOC-NEXT: } ++# RELOC-NEXT: ] +diff --git a/llvm/test/MC/LoongArch/Misc/subsection.s b/llvm/test/MC/LoongArch/Misc/subsection.s +index 0bd22b474536..566a2408d691 100644 +--- a/llvm/test/MC/LoongArch/Misc/subsection.s ++++ b/llvm/test/MC/LoongArch/Misc/subsection.s +@@ -1,5 +1,5 @@ + # RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,NORELAX --implicit-check-not=error: +-## TODO: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error: ++# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error: + + a: + nop +diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +index 532eb4e0561a..c4454f5bb98d 100644 +--- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s ++++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +@@ -18,7 +18,9 @@ + # RELAX: Relocations [ + # RELAX-NEXT: Section ({{.*}}) .rela.text { + # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 ++# RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 + # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 ++# RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 + # RELAX-NEXT: } + # RELAX-NEXT: Section ({{.*}}) .rela.data { + # RELAX-NEXT: 0xF R_LARCH_ADD8 .L3 0x0 +@@ -29,13 +31,21 @@ + # RELAX-NEXT: 0x12 R_LARCH_SUB32 .L2 0x0 + # RELAX-NEXT: 0x16 R_LARCH_ADD64 .L3 0x0 + # RELAX-NEXT: 0x16 R_LARCH_SUB64 .L2 0x0 ++# RELAX-NEXT: 0x1E R_LARCH_ADD8 .L4 0x0 ++# RELAX-NEXT: 0x1E R_LARCH_SUB8 .L3 0x0 ++# RELAX-NEXT: 0x1F R_LARCH_ADD16 .L4 0x0 ++# RELAX-NEXT: 0x1F R_LARCH_SUB16 .L3 0x0 ++# RELAX-NEXT: 0x21 R_LARCH_ADD32 .L4 0x0 ++# RELAX-NEXT: 0x21 
R_LARCH_SUB32 .L3 0x0 ++# RELAX-NEXT: 0x25 R_LARCH_ADD64 .L4 0x0 ++# RELAX-NEXT: 0x25 R_LARCH_SUB64 .L3 0x0 + # RELAX-NEXT: } + # RELAX-NEXT: ] + + # RELAX: Hex dump of section '.data': + # RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000000 +-# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000808 +-# RELAX-NEXT: 0x00000020 00080000 00080000 00000000 00 ++# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000 ++# RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00 + + .text + .L1: +@@ -60,8 +70,6 @@ + .short .L3 - .L2 + .word .L3 - .L2 + .dword .L3 - .L2 +-## TODO +-## With relaxation, emit relocs because la.pcrel is a linker-relaxable inst. + .byte .L4 - .L3 + .short .L4 - .L3 + .word .L4 - .L3 +-- +2.20.1 + diff --git a/0004-Backport-MC-LoongArch-Add-AlignFragment-size-if-layout-is-available-and-not-need-insert-nops.patch b/0004-Backport-MC-LoongArch-Add-AlignFragment-size-if-layout-is-available-and-not-need-insert-nops.patch new file mode 100644 index 0000000..72b7924 --- /dev/null +++ b/0004-Backport-MC-LoongArch-Add-AlignFragment-size-if-layout-is-available-and-not-need-insert-nops.patch @@ -0,0 +1,123 @@ +From be6e5c566f49bee5efe3d710bdd321e15d8d95ea Mon Sep 17 00:00:00 2001 +From: Jinyang He +Date: Thu, 14 Mar 2024 12:10:50 +0800 +Subject: [PATCH 05/14] [MC][LoongArch] Add AlignFragment size if layout is + available and not need insert nops (#76552) + +Due to delayed decision for ADD/SUB relocations, RISCV and LoongArch may +go slow fragment walk path with available layout. When RISCV (or +LoongArch in the future) don't need insert nops, that means relax is +disabled. With available layout and not needing insert nops, the size of +AlignFragment should be a constant. So we can add it to Displacement for +folding A-B. 
+ +(cherry picked from commit 0731567a31e4ade97c27801045156a88c4589704) +Change-Id: I554d6766bd7f688204e956e4a6431574b4c511c9 +--- + llvm/lib/MC/MCExpr.cpp | 6 +++++ + llvm/test/MC/LoongArch/Misc/cfi-advance.s | 27 +++++++++++++++++++ + .../MC/LoongArch/Relocations/relax-addsub.s | 17 +++--------- + 3 files changed, 37 insertions(+), 13 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Misc/cfi-advance.s + +diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp +index 5a6596f93824..a561fed11179 100644 +--- a/llvm/lib/MC/MCExpr.cpp ++++ b/llvm/lib/MC/MCExpr.cpp +@@ -707,8 +707,14 @@ static void AttemptToFoldSymbolOffsetDifference( + } + + int64_t Num; ++ unsigned Count; + if (DF) { + Displacement += DF->getContents().size(); ++ } else if (auto *AF = dyn_cast(FI); ++ AF && Layout && ++ !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign( ++ *AF, Count)) { ++ Displacement += Asm->computeFragmentSize(*Layout, *AF); + } else if (auto *FF = dyn_cast(FI); + FF && FF->getNumValues().evaluateAsAbsolute(Num)) { + Displacement += Num * FF->getValueSize(); +diff --git a/llvm/test/MC/LoongArch/Misc/cfi-advance.s b/llvm/test/MC/LoongArch/Misc/cfi-advance.s +new file mode 100644 +index 000000000000..662c43e6bcea +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Misc/cfi-advance.s +@@ -0,0 +1,27 @@ ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=-relax %s -o %t.o ++# RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOC %s ++# RUN: llvm-dwarfdump --debug-frame %t.o | FileCheck --check-prefix=DWARFDUMP %s ++ ++# RELOC: Relocations [ ++# RELOC-NEXT: .rela.eh_frame { ++# RELOC-NEXT: 0x1C R_LARCH_32_PCREL .text 0x0 ++# RELOC-NEXT: } ++# RELOC-NEXT: ] ++# DWARFDUMP: DW_CFA_advance_loc: 4 ++# DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 ++# DWARFDUMP-NEXT: DW_CFA_advance_loc: 8 ++# DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 ++ ++ .text ++ .globl test ++ .p2align 2 ++ .type test,@function ++test: ++ .cfi_startproc ++ nop ++ .cfi_def_cfa_offset 8 ++ .p2align 3 ++ 
nop ++ .cfi_def_cfa_offset 8 ++ nop ++ .cfi_endproc +diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +index c4454f5bb98d..14922657ae89 100644 +--- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s ++++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +@@ -23,14 +23,6 @@ + # RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 + # RELAX-NEXT: } + # RELAX-NEXT: Section ({{.*}}) .rela.data { +-# RELAX-NEXT: 0xF R_LARCH_ADD8 .L3 0x0 +-# RELAX-NEXT: 0xF R_LARCH_SUB8 .L2 0x0 +-# RELAX-NEXT: 0x10 R_LARCH_ADD16 .L3 0x0 +-# RELAX-NEXT: 0x10 R_LARCH_SUB16 .L2 0x0 +-# RELAX-NEXT: 0x12 R_LARCH_ADD32 .L3 0x0 +-# RELAX-NEXT: 0x12 R_LARCH_SUB32 .L2 0x0 +-# RELAX-NEXT: 0x16 R_LARCH_ADD64 .L3 0x0 +-# RELAX-NEXT: 0x16 R_LARCH_SUB64 .L2 0x0 + # RELAX-NEXT: 0x1E R_LARCH_ADD8 .L4 0x0 + # RELAX-NEXT: 0x1E R_LARCH_SUB8 .L3 0x0 + # RELAX-NEXT: 0x1F R_LARCH_ADD16 .L4 0x0 +@@ -43,8 +35,8 @@ + # RELAX-NEXT: ] + + # RELAX: Hex dump of section '.data': +-# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000000 +-# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000 ++# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 0000000c ++# RELAX-NEXT: 0x00000010 0c000c00 00000c00 00000000 00000000 + # RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00 + + .text +@@ -63,13 +55,12 @@ + .short .L2 - .L1 + .word .L2 - .L1 + .dword .L2 - .L1 +-## With relaxation, emit relocs because of the .align making the diff variable. +-## TODO Handle alignment directive. Why they emit relocs now? They returns +-## without folding symbols offset in AttemptToFoldSymbolOffsetDifference(). ++## TODO Handle alignment directive. + .byte .L3 - .L2 + .short .L3 - .L2 + .word .L3 - .L2 + .dword .L3 - .L2 ++## With relaxation, emit relocs because the la.pcrel makes the diff variable. 
+ .byte .L4 - .L3 + .short .L4 - .L3 + .word .L4 - .L3 +-- +2.20.1 + diff --git a/0005-Backport-LoongArch-RISCV-Support-R_LARCH_-ADD-SUB-_ULEB128-R_RISCV_-SET-SUB-_ULEB128-for-uleb128-directives.patch b/0005-Backport-LoongArch-RISCV-Support-R_LARCH_-ADD-SUB-_ULEB128-R_RISCV_-SET-SUB-_ULEB128-for-uleb128-directives.patch new file mode 100644 index 0000000..c1c4f9f --- /dev/null +++ b/0005-Backport-LoongArch-RISCV-Support-R_LARCH_-ADD-SUB-_ULEB128-R_RISCV_-SET-SUB-_ULEB128-for-uleb128-directives.patch @@ -0,0 +1,633 @@ +From 8d7b71890179d32474b3a1a1c627481bd5a2327d Mon Sep 17 00:00:00 2001 +From: zhanglimin +Date: Fri, 15 Mar 2024 14:39:48 +0800 +Subject: [PATCH 06/14] [LoongArch][RISCV] Support + R_LARCH_{ADD,SUB}_ULEB128/R_RISCV_{SET,SUB}_ULEB128 for .uleb128 directives + +This patch is originally from three upstream commits: +1, R_LARCH_{ADD,SUB}_ULEB128 are originally landed from b57159cb(#76433). +2, R_RISCV_{SET,SUB}_ULEB128 are originally supported from 1df5ea29. Among it, we change +the default behaviour of `-riscv-uleb128-reloc` to not produce uleb128 reloc, in order +to avoid any other side-effects due to the updated implementation of `MCAssembler::relaxLEB()` +function. And at the same time, we ensure that this patch can't introduce new default traits +(such as the generation for uleb128 reloc) on RISCV in this version. +3, Fix invalid-sleb.s in original commit d7398a35. 
+ +Change-Id: Ie687b7d8483c76cf647141162641db1a9d819a04 +--- + .../llvm/BinaryFormat/ELFRelocs/RISCV.def | 2 + + llvm/include/llvm/MC/MCAsmBackend.h | 8 +++ + llvm/include/llvm/MC/MCFixup.h | 1 + + llvm/include/llvm/MC/MCFragment.h | 9 ++- + llvm/lib/MC/MCAsmBackend.cpp | 1 + + llvm/lib/MC/MCAssembler.cpp | 39 ++++++++-- + .../MCTargetDesc/LoongArchAsmBackend.cpp | 69 ++++++++++++++---- + .../MCTargetDesc/LoongArchAsmBackend.h | 3 + + .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 27 +++++++ + .../RISCV/MCTargetDesc/RISCVAsmBackend.h | 2 + + llvm/test/MC/ELF/RISCV/gen-dwarf.s | 5 +- + llvm/test/MC/LoongArch/Relocations/leb128.s | 72 +++++++++++++++++++ + .../MC/LoongArch/Relocations/relax-addsub.s | 57 +++++++++++---- + llvm/test/MC/X86/invalid-sleb.s | 5 -- + 14 files changed, 252 insertions(+), 48 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Relocations/leb128.s + delete mode 100644 llvm/test/MC/X86/invalid-sleb.s + +diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def +index 9a126df01531..c7fd6490041c 100644 +--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def ++++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def +@@ -55,3 +55,5 @@ ELF_RELOC(R_RISCV_SET32, 56) + ELF_RELOC(R_RISCV_32_PCREL, 57) + ELF_RELOC(R_RISCV_IRELATIVE, 58) + ELF_RELOC(R_RISCV_PLT32, 59) ++ELF_RELOC(R_RISCV_SET_ULEB128, 60) ++ELF_RELOC(R_RISCV_SUB_ULEB128, 61) +diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h +index 5e08fb41679b..968a767b17f8 100644 +--- a/llvm/include/llvm/MC/MCAsmBackend.h ++++ b/llvm/include/llvm/MC/MCAsmBackend.h +@@ -21,6 +21,7 @@ class MCAlignFragment; + class MCDwarfCallFrameFragment; + class MCDwarfLineAddrFragment; + class MCFragment; ++class MCLEBFragment; + class MCRelaxableFragment; + class MCSymbol; + class MCAsmLayout; +@@ -194,6 +195,13 @@ public: + return false; + } + ++ // Defined by linker relaxation targets to possibly emit LEB128 
relocations ++ // and set Value at the relocated location. ++ virtual std::pair ++ relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, int64_t &Value) const { ++ return std::make_pair(false, false); ++ } ++ + /// @} + + /// Returns the minimum size of a nop in bytes on this target. The assembler +diff --git a/llvm/include/llvm/MC/MCFixup.h b/llvm/include/llvm/MC/MCFixup.h +index 069ca058310f..7f48a90cb1ec 100644 +--- a/llvm/include/llvm/MC/MCFixup.h ++++ b/llvm/include/llvm/MC/MCFixup.h +@@ -25,6 +25,7 @@ enum MCFixupKind { + FK_Data_4, ///< A four-byte fixup. + FK_Data_8, ///< A eight-byte fixup. + FK_Data_6b, ///< A six-bits fixup. ++ FK_Data_leb128, ///< A leb128 fixup. + FK_PCRel_1, ///< A one-byte pc relative fixup. + FK_PCRel_2, ///< A two-byte pc relative fixup. + FK_PCRel_4, ///< A four-byte pc relative fixup. +diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h +index 7be4792a4521..e965732010fe 100644 +--- a/llvm/include/llvm/MC/MCFragment.h ++++ b/llvm/include/llvm/MC/MCFragment.h +@@ -428,7 +428,7 @@ public: + } + }; + +-class MCLEBFragment : public MCFragment { ++class MCLEBFragment final : public MCEncodedFragmentWithFixups<10, 1> { + /// True if this is a sleb128, false if uleb128. 
+ bool IsSigned; + +@@ -439,17 +439,16 @@ class MCLEBFragment : public MCFragment { + + public: + MCLEBFragment(const MCExpr &Value_, bool IsSigned_, MCSection *Sec = nullptr) +- : MCFragment(FT_LEB, false, Sec), IsSigned(IsSigned_), Value(&Value_) { ++ : MCEncodedFragmentWithFixups<10, 1>(FT_LEB, false, Sec), ++ IsSigned(IsSigned_), Value(&Value_) { + Contents.push_back(0); + } + + const MCExpr &getValue() const { return *Value; } ++ void setValue(const MCExpr *Expr) { Value = Expr; } + + bool isSigned() const { return IsSigned; } + +- SmallString<8> &getContents() { return Contents; } +- const SmallString<8> &getContents() const { return Contents; } +- + /// @} + + static bool classof(const MCFragment *F) { +diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp +index 64bbc63719c7..2eef7d363fe7 100644 +--- a/llvm/lib/MC/MCAsmBackend.cpp ++++ b/llvm/lib/MC/MCAsmBackend.cpp +@@ -89,6 +89,7 @@ const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { + {"FK_Data_4", 0, 32, 0}, + {"FK_Data_8", 0, 64, 0}, + {"FK_Data_6b", 0, 6, 0}, ++ {"FK_Data_leb128", 0, 0, 0}, + {"FK_PCRel_1", 0, 8, MCFixupKindInfo::FKF_IsPCRel}, + {"FK_PCRel_2", 0, 16, MCFixupKindInfo::FKF_IsPCRel}, + {"FK_PCRel_4", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, +diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp +index 55ed1a285cd7..86c798ec9e27 100644 +--- a/llvm/lib/MC/MCAssembler.cpp ++++ b/llvm/lib/MC/MCAssembler.cpp +@@ -918,6 +918,12 @@ void MCAssembler::layout(MCAsmLayout &Layout) { + Contents = DF.getContents(); + break; + } ++ case MCFragment::FT_LEB: { ++ auto &LF = cast(Frag); ++ Fixups = LF.getFixups(); ++ Contents = LF.getContents(); ++ break; ++ } + case MCFragment::FT_PseudoProbe: { + MCPseudoProbeAddrFragment &PF = cast(Frag); + Fixups = PF.getFixups(); +@@ -1006,12 +1012,31 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout, + } + + bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { +- uint64_t OldSize = 
LF.getContents().size(); ++ const unsigned OldSize = static_cast(LF.getContents().size()); ++ unsigned PadTo = OldSize; + int64_t Value; +- bool Abs = LF.getValue().evaluateKnownAbsolute(Value, Layout); +- if (!Abs) +- report_fatal_error("sleb128 and uleb128 expressions must be absolute"); +- SmallString<8> &Data = LF.getContents(); ++ SmallVectorImpl &Data = LF.getContents(); ++ LF.getFixups().clear(); ++ // Use evaluateKnownAbsolute for Mach-O as a hack: .subsections_via_symbols ++ // requires that .uleb128 A-B is foldable where A and B reside in different ++ // fragments. This is used by __gcc_except_table. ++ bool Abs = getSubsectionsViaSymbols() ++ ? LF.getValue().evaluateKnownAbsolute(Value, Layout) ++ : LF.getValue().evaluateAsAbsolute(Value, Layout); ++ if (!Abs) { ++ bool Relaxed, UseZeroPad; ++ std::tie(Relaxed, UseZeroPad) = getBackend().relaxLEB128(LF, Layout, Value); ++ if (!Relaxed) { ++ getContext().reportError(LF.getValue().getLoc(), ++ Twine(LF.isSigned() ? ".s" : ".u") + ++ "leb128 expression is not absolute"); ++ LF.setValue(MCConstantExpr::create(0, Context)); ++ } ++ uint8_t Tmp[10]; // maximum size: ceil(64/7) ++ PadTo = std::max(PadTo, encodeULEB128(uint64_t(Value), Tmp)); ++ if (UseZeroPad) ++ Value = 0; ++ } + Data.clear(); + raw_svector_ostream OSE(Data); + // The compiler can generate EH table assembly that is impossible to assemble +@@ -1019,9 +1044,9 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { + // to a later alignment fragment. To accommodate such tables, relaxation can + // only increase an LEB fragment size here, not decrease it. See PR35809. 
+ if (LF.isSigned()) +- encodeSLEB128(Value, OSE, OldSize); ++ encodeSLEB128(Value, OSE, PadTo); + else +- encodeULEB128(Value, OSE, OldSize); ++ encodeULEB128(Value, OSE, PadTo); + return OldSize != LF.getContents().size(); + } + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +index 1ed047a8e632..9227d4d6afed 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +@@ -92,6 +92,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: ++ case FK_Data_leb128: + return Value; + case LoongArch::fixup_loongarch_b16: { + if (!isInt<18>(Value)) +@@ -129,6 +130,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + } + } + ++static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, ++ MutableArrayRef Data, uint64_t Value) { ++ unsigned I; ++ for (I = 0; I != Data.size() && Value; ++I, Value >>= 7) ++ Data[I] |= uint8_t(Value & 0x7f); ++ if (Value) ++ Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!"); ++} ++ + void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target, +@@ -144,6 +154,10 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, + MCFixupKindInfo Info = getFixupKindInfo(Kind); + MCContext &Ctx = Asm.getContext(); + ++ // Fixup leb128 separately. ++ if (Fixup.getTargetKind() == FK_Data_leb128) ++ return fixupLeb128(Ctx, Fixup, Data, Value); ++ + // Apply any target-specific value adjustments. 
+ Value = adjustFixupValue(Fixup, Value, Ctx); + +@@ -173,6 +187,7 @@ bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: ++ case FK_Data_leb128: + return !Target.isAbsolute(); + } + } +@@ -202,9 +217,24 @@ getRelocPairForSize(unsigned Size) { + return std::make_pair( + MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD64), + MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB64)); ++ case 128: ++ return std::make_pair( ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD_ULEB128), ++ MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB_ULEB128)); + } + } + ++std::pair LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF, ++ MCAsmLayout &Layout, ++ int64_t &Value) const { ++ const MCExpr &Expr = LF.getValue(); ++ if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, Layout)) ++ return std::make_pair(false, false); ++ LF.getFixups().push_back( ++ MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc())); ++ return std::make_pair(true, true); ++} ++ + bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const { + // We mostly follow binutils' convention here: align to 4-byte boundary with a +@@ -226,21 +256,27 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout, + uint64_t &FixedValue) const { + std::pair FK; + uint64_t FixedValueA, FixedValueB; +- const MCSection &SecA = Target.getSymA()->getSymbol().getSection(); +- const MCSection &SecB = Target.getSymB()->getSymbol().getSection(); +- +- // We need record relocation if SecA != SecB. Usually SecB is same as the +- // section of Fixup, which will be record the relocation as PCRel. If SecB +- // is not same as the section of Fixup, it will report error. Just return +- // false and then this work can be finished by handleFixup. +- if (&SecA != &SecB) +- return false; +- +- // In SecA == SecB case. 
If the linker relaxation is enabled, we need record +- // the ADD, SUB relocations. Otherwise the FixedValue has already been +- // calculated out in evaluateFixup, return true and avoid record relocations. +- if (!STI.hasFeature(LoongArch::FeatureRelax)) +- return true; ++ const MCSymbol &SA = Target.getSymA()->getSymbol(); ++ const MCSymbol &SB = Target.getSymB()->getSymbol(); ++ ++ bool force = !SA.isInSection() || !SB.isInSection(); ++ if (!force) { ++ const MCSection &SecA = SA.getSection(); ++ const MCSection &SecB = SB.getSection(); ++ ++ // We need record relocation if SecA != SecB. Usually SecB is same as the ++ // section of Fixup, which will be record the relocation as PCRel. If SecB ++ // is not same as the section of Fixup, it will report error. Just return ++ // false and then this work can be finished by handleFixup. ++ if (&SecA != &SecB) ++ return false; ++ ++ // In SecA == SecB case. If the linker relaxation is enabled, we need record ++ // the ADD, SUB relocations. Otherwise the FixedValue has already been calc- ++ // ulated out in evaluateFixup, return true and avoid record relocations. 
++ if (!STI.hasFeature(LoongArch::FeatureRelax)) ++ return true; ++ } + + switch (Fixup.getKind()) { + case llvm::FK_Data_1: +@@ -255,6 +291,9 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout, + case llvm::FK_Data_8: + FK = getRelocPairForSize(64); + break; ++ case llvm::FK_Data_leb128: ++ FK = getRelocPairForSize(128); ++ break; + default: + llvm_unreachable("unsupported fixup size"); + } +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +index 20f25b5cf53b..49801e4fd81a 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +@@ -65,6 +65,9 @@ public: + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override {} + ++ std::pair relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, ++ int64_t &Value) const override; ++ + bool writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const override; + +diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +index 1b890fbe041a..5c651aa93225 100644 +--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp ++++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +@@ -19,6 +19,7 @@ + #include "llvm/MC/MCObjectWriter.h" + #include "llvm/MC/MCSymbol.h" + #include "llvm/MC/MCValue.h" ++#include "llvm/Support/CommandLine.h" + #include "llvm/Support/Endian.h" + #include "llvm/Support/EndianStream.h" + #include "llvm/Support/ErrorHandling.h" +@@ -27,6 +28,13 @@ + + using namespace llvm; + ++// Temporary workaround for old linkers that do not support ULEB128 relocations, ++// which are abused by DWARF v5 DW_LLE_offset_pair/DW_RLE_offset_pair ++// implemented in Clang/LLVM. 
++static cl::opt ULEB128Reloc( ++ "riscv-uleb128-reloc", cl::init(false), cl::Hidden, ++ cl::desc("Emit R_RISCV_SET_ULEB128/E_RISCV_SUB_ULEB128 if appropriate")); ++ + std::optional RISCVAsmBackend::getFixupKind(StringRef Name) const { + if (STI.getTargetTriple().isOSBinFormatELF()) { + unsigned Type; +@@ -126,6 +134,7 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: ++ case FK_Data_leb128: + if (Target.isAbsolute()) + return false; + break; +@@ -330,6 +339,19 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, + return true; + } + ++std::pair RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, ++ MCAsmLayout &Layout, ++ int64_t &Value) const { ++ if (LF.isSigned()) ++ return std::make_pair(false, false); ++ const MCExpr &Expr = LF.getValue(); ++ if (ULEB128Reloc) { ++ LF.getFixups().push_back( ++ MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc())); ++ } ++ return std::make_pair(Expr.evaluateKnownAbsolute(Value, Layout), false); ++} ++ + // Given a compressed control flow instruction this function returns + // the expanded instruction. 
+ unsigned RISCVAsmBackend::getRelaxedOpcode(unsigned Op) const { +@@ -416,6 +438,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + case FK_Data_4: + case FK_Data_8: + case FK_Data_6b: ++ case FK_Data_leb128: + return Value; + case RISCV::fixup_riscv_set_6b: + return Value & 0x03; +@@ -596,6 +619,10 @@ bool RISCVAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout, + TA = ELF::R_RISCV_ADD64; + TB = ELF::R_RISCV_SUB64; + break; ++ case llvm::FK_Data_leb128: ++ TA = ELF::R_RISCV_SET_ULEB128; ++ TB = ELF::R_RISCV_SUB_ULEB128; ++ break; + default: + llvm_unreachable("unsupported fixup size"); + } +diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +index 0ea1f32e8296..edefb171bcdc 100644 +--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h ++++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +@@ -99,6 +99,8 @@ public: + bool &WasRelaxed) const override; + bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout, + bool &WasRelaxed) const override; ++ std::pair relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, ++ int64_t &Value) const override; + + bool writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const override; +diff --git a/llvm/test/MC/ELF/RISCV/gen-dwarf.s b/llvm/test/MC/ELF/RISCV/gen-dwarf.s +index 2235559d5f35..2a7dc777e70c 100644 +--- a/llvm/test/MC/ELF/RISCV/gen-dwarf.s ++++ b/llvm/test/MC/ELF/RISCV/gen-dwarf.s +@@ -9,7 +9,7 @@ + ## emit special opcodes to make .debug_line smaller, but we don't do this for + ## consistency. 
+ +-# RUN: llvm-mc -filetype=obj -triple=riscv64 -g -dwarf-version=5 -mattr=+relax < %s -o %t ++# RUN: llvm-mc -filetype=obj -triple=riscv64 -g -dwarf-version=5 -mattr=+relax -riscv-uleb128-reloc=1 < %s -o %t + # RUN: llvm-dwarfdump -eh-frame -debug-line -debug-rnglists -v %t | FileCheck %s + # RUN: llvm-readobj -r -x .eh_frame %t | FileCheck %s --check-prefix=RELOC + +@@ -48,9 +48,10 @@ + # RELOC-NEXT: 0x34 R_RISCV_32_PCREL 0x0 + # RELOC-NEXT: } + +-## TODO A section needs two relocations. + # RELOC: Section ([[#]]) .rela.debug_rnglists { + # RELOC-NEXT: 0xD R_RISCV_64 .text.foo 0x0 ++# RELOC-NEXT: 0x15 R_RISCV_SET_ULEB128 0x0 ++# RELOC-NEXT: 0x15 R_RISCV_SUB_ULEB128 .text.foo 0x0 + # RELOC-NEXT: 0x17 R_RISCV_64 .text.bar 0x0 + # RELOC-NEXT: } + +diff --git a/llvm/test/MC/LoongArch/Relocations/leb128.s b/llvm/test/MC/LoongArch/Relocations/leb128.s +new file mode 100644 +index 000000000000..7a96ec551b76 +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Relocations/leb128.s +@@ -0,0 +1,72 @@ ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t ++# RUN: llvm-readobj -r -x .alloc_w %t | FileCheck --check-prefixes=CHECK,NORELAX %s ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax ++# RUN: llvm-readobj -r -x .alloc_w %t.relax | FileCheck --check-prefixes=CHECK,RELAX %s ++ ++# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax --defsym ERR=1 %s -o /dev/null 2>&1 | \ ++# RUN: FileCheck %s --check-prefix=ERR ++# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym ERR=1 %s -o /dev/null 2>&1 | \ ++# RUN: FileCheck %s --check-prefix=ERR ++ ++# CHECK: Relocations [ ++# CHECK-NEXT: .rela.alloc_w { ++# RELAX-NEXT: 0x0 R_LARCH_ADD_ULEB128 w1 0x0 ++# RELAX-NEXT: 0x0 R_LARCH_SUB_ULEB128 w 0x0 ++# RELAX-NEXT: 0x1 R_LARCH_ADD_ULEB128 w2 0x0 ++# RELAX-NEXT: 0x1 R_LARCH_SUB_ULEB128 w1 0x0 ++# CHECK-NEXT: 0x2 R_LARCH_PCALA_HI20 foo 0x0 ++# RELAX-NEXT: 0x2 R_LARCH_RELAX - 0x0 ++# CHECK-NEXT: 0x6 
R_LARCH_PCALA_LO12 foo 0x0 ++# RELAX-NEXT: 0x6 R_LARCH_RELAX - 0x0 ++# RELAX-NEXT: 0xA R_LARCH_ADD_ULEB128 w2 0x0 ++# RELAX-NEXT: 0xA R_LARCH_SUB_ULEB128 w1 0x0 ++# RELAX-NEXT: 0xB R_LARCH_ADD_ULEB128 w2 0x78 ++# RELAX-NEXT: 0xB R_LARCH_SUB_ULEB128 w1 0x0 ++# RELAX-NEXT: 0xD R_LARCH_ADD_ULEB128 w1 0x0 ++# RELAX-NEXT: 0xD R_LARCH_SUB_ULEB128 w2 0x0 ++# RELAX-NEXT: 0x17 R_LARCH_ADD_ULEB128 w3 0x6F ++# RELAX-NEXT: 0x17 R_LARCH_SUB_ULEB128 w2 0x0 ++# RELAX-NEXT: 0x18 R_LARCH_ADD_ULEB128 w3 0x71 ++# RELAX-NEXT: 0x18 R_LARCH_SUB_ULEB128 w2 0x0 ++# CHECK-NEXT: } ++# CHECK-NEXT: ] ++ ++# CHECK: Hex dump of section '.alloc_w': ++# NORELAX-NEXT: 0x00000000 02080c00 001a8c01 c0020880 01f8ffff ++# NORELAX-NEXT: 0x00000010 ffffffff ffff017f 8101 ++# RELAX-NEXT: 0x00000000 00000c00 001a8c01 c0020080 00808080 ++# RELAX-NEXT: 0x00000010 80808080 80800000 8000 ++ ++.section .alloc_w,"ax",@progbits; w: ++.uleb128 w1-w # w1 is later defined in the same section ++.uleb128 w2-w1 # w1 and w2 are separated by a linker relaxable instruction ++w1: ++ la.pcrel $t0, foo ++w2: ++.uleb128 w2-w1 # 0x08 ++.uleb128 w2-w1+120 # 0x0180 ++.uleb128 -(w2-w1) # 0x01fffffffffffffffff8 ++.uleb128 w3-w2+111 # 0x7f ++.uleb128 w3-w2+113 # 0x0181 ++w3: ++ ++.ifdef ERR ++# ERR: :[[#@LINE+1]]:16: error: .uleb128 expression is not absolute ++.uleb128 extern-w # extern is undefined ++# ERR: :[[#@LINE+1]]:11: error: .uleb128 expression is not absolute ++.uleb128 w-extern ++# ERR: :[[#@LINE+1]]:11: error: .uleb128 expression is not absolute ++.uleb128 x-w # x is later defined in another section ++ ++.section .alloc_x,"aw",@progbits; x: ++# ERR: :[[#@LINE+1]]:11: error: .uleb128 expression is not absolute ++.uleb128 y-x ++.section .alloc_y,"aw",@progbits; y: ++# ERR: :[[#@LINE+1]]:11: error: .uleb128 expression is not absolute ++.uleb128 x-y ++ ++# ERR: :[[#@LINE+1]]:10: error: .uleb128 expression is not absolute ++.uleb128 extern ++# ERR: :[[#@LINE+1]]:10: error: .uleb128 expression is not absolute ++.uleb128 y 
++.endif +diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +index 14922657ae89..cd01332afd0b 100644 +--- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s ++++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +@@ -8,12 +8,23 @@ + # NORELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .text 0x0 + # NORELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .text 0x0 + # NORELAX-NEXT: } ++# NORELAX-NEXT: Section ({{.*}}) .rela.data { ++# NORELAX-NEXT: 0x30 R_LARCH_ADD8 foo 0x0 ++# NORELAX-NEXT: 0x30 R_LARCH_SUB8 .text 0x10 ++# NORELAX-NEXT: 0x31 R_LARCH_ADD16 foo 0x0 ++# NORELAX-NEXT: 0x31 R_LARCH_SUB16 .text 0x10 ++# NORELAX-NEXT: 0x33 R_LARCH_ADD32 foo 0x0 ++# NORELAX-NEXT: 0x33 R_LARCH_SUB32 .text 0x10 ++# NORELAX-NEXT: 0x37 R_LARCH_ADD64 foo 0x0 ++# NORELAX-NEXT: 0x37 R_LARCH_SUB64 .text 0x10 ++# NORELAX-NEXT: } + # NORELAX-NEXT: ] + + # NORELAX: Hex dump of section '.data': +-# NORELAX-NEXT: 0x00000000 04040004 00000004 00000000 0000000c +-# NORELAX-NEXT: 0x00000010 0c000c00 00000c00 00000000 00000808 +-# NORELAX-NEXT: 0x00000020 00080000 00080000 00000000 00 ++# NORELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000004 ++# NORELAX-NEXT: 0x00000010 0c0c000c 0000000c 00000000 0000000c ++# NORELAX-NEXT: 0x00000020 08080008 00000008 00000000 00000008 ++# NORELAX-NEXT: 0x00000030 00000000 00000000 00000000 000000 + + # RELAX: Relocations [ + # RELAX-NEXT: Section ({{.*}}) .rela.text { +@@ -23,21 +34,32 @@ + # RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 + # RELAX-NEXT: } + # RELAX-NEXT: Section ({{.*}}) .rela.data { +-# RELAX-NEXT: 0x1E R_LARCH_ADD8 .L4 0x0 +-# RELAX-NEXT: 0x1E R_LARCH_SUB8 .L3 0x0 +-# RELAX-NEXT: 0x1F R_LARCH_ADD16 .L4 0x0 +-# RELAX-NEXT: 0x1F R_LARCH_SUB16 .L3 0x0 +-# RELAX-NEXT: 0x21 R_LARCH_ADD32 .L4 0x0 +-# RELAX-NEXT: 0x21 R_LARCH_SUB32 .L3 0x0 +-# RELAX-NEXT: 0x25 R_LARCH_ADD64 .L4 0x0 +-# RELAX-NEXT: 0x25 R_LARCH_SUB64 .L3 0x0 ++# RELAX-NEXT: 0x20 R_LARCH_ADD8 .L4 0x0 ++# RELAX-NEXT: 0x20 R_LARCH_SUB8 .L3 0x0 
++# RELAX-NEXT: 0x21 R_LARCH_ADD16 .L4 0x0 ++# RELAX-NEXT: 0x21 R_LARCH_SUB16 .L3 0x0 ++# RELAX-NEXT: 0x23 R_LARCH_ADD32 .L4 0x0 ++# RELAX-NEXT: 0x23 R_LARCH_SUB32 .L3 0x0 ++# RELAX-NEXT: 0x27 R_LARCH_ADD64 .L4 0x0 ++# RELAX-NEXT: 0x27 R_LARCH_SUB64 .L3 0x0 ++# RELAX-NEXT: 0x2F R_LARCH_ADD_ULEB128 .L4 0x0 ++# RELAX-NEXT: 0x2F R_LARCH_SUB_ULEB128 .L3 0x0 ++# RELAX-NEXT: 0x30 R_LARCH_ADD8 foo 0x0 ++# RELAX-NEXT: 0x30 R_LARCH_SUB8 .L3 0x0 ++# RELAX-NEXT: 0x31 R_LARCH_ADD16 foo 0x0 ++# RELAX-NEXT: 0x31 R_LARCH_SUB16 .L3 0x0 ++# RELAX-NEXT: 0x33 R_LARCH_ADD32 foo 0x0 ++# RELAX-NEXT: 0x33 R_LARCH_SUB32 .L3 0x0 ++# RELAX-NEXT: 0x37 R_LARCH_ADD64 foo 0x0 ++# RELAX-NEXT: 0x37 R_LARCH_SUB64 .L3 0x0 + # RELAX-NEXT: } + # RELAX-NEXT: ] + + # RELAX: Hex dump of section '.data': +-# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 0000000c +-# RELAX-NEXT: 0x00000010 0c000c00 00000c00 00000000 00000000 +-# RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00 ++# RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000004 ++# RELAX-NEXT: 0x00000010 0c0c000c 0000000c 00000000 0000000c ++# RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00000000 ++# RELAX-NEXT: 0x00000030 00000000 00000000 00000000 000000 + + .text + .L1: +@@ -55,13 +77,20 @@ + .short .L2 - .L1 + .word .L2 - .L1 + .dword .L2 - .L1 ++.uleb128 .L2 - .L1 + ## TODO Handle alignment directive. + .byte .L3 - .L2 + .short .L3 - .L2 + .word .L3 - .L2 + .dword .L3 - .L2 ++.uleb128 .L3 - .L2 + ## With relaxation, emit relocs because the la.pcrel makes the diff variable. 
+ .byte .L4 - .L3 + .short .L4 - .L3 + .word .L4 - .L3 + .dword .L4 - .L3 ++.uleb128 .L4 - .L3 ++.byte foo - .L3 ++.short foo - .L3 ++.word foo - .L3 ++.dword foo - .L3 +diff --git a/llvm/test/MC/X86/invalid-sleb.s b/llvm/test/MC/X86/invalid-sleb.s +deleted file mode 100644 +index 7d7df351ce4e..000000000000 +--- a/llvm/test/MC/X86/invalid-sleb.s ++++ /dev/null +@@ -1,5 +0,0 @@ +-// RUN: not --crash llvm-mc -filetype=obj -triple x86_64-pc-linux %s -o %t 2>&1 | FileCheck %s +- +-// CHECK: sleb128 and uleb128 expressions must be absolute +- +- .sleb128 undefined +-- +2.20.1 + diff --git a/0006-Backport-LoongArch-Add-relaxDwarfLineAddr-and-relaxDwarfCFA-to-handle-the-mutable-label-diff-in-dwarfinfo.patch b/0006-Backport-LoongArch-Add-relaxDwarfLineAddr-and-relaxDwarfCFA-to-handle-the-mutable-label-diff-in-dwarfinfo.patch new file mode 100644 index 0000000..4d19f8c --- /dev/null +++ b/0006-Backport-LoongArch-Add-relaxDwarfLineAddr-and-relaxDwarfCFA-to-handle-the-mutable-label-diff-in-dwarfinfo.patch @@ -0,0 +1,376 @@ +From 286c92a8e78c4b67368c2f47a8e73036fdacbae2 Mon Sep 17 00:00:00 2001 +From: Jinyang He +Date: Tue, 16 Jan 2024 13:20:13 +0800 +Subject: [PATCH 07/14] [LoongArch] Add relaxDwarfLineAddr and relaxDwarfCFA to + handle the mutable label diff in dwarfinfo (#77728) + +When linker-relaxation is enabled, part of the label diff in dwarfinfo +cannot be computed before static link. Refer to RISCV, we add the +relaxDwarfLineAddr and relaxDwarfCFA to add relocations for these label +diffs. Calculate whether the label diff is mutable. For immutable label +diff, return false and do the other works by its parent function. 
+ +(cherry picked from commit ed7f4edc19ada006789318a0929b57d1b5a761bd) +Change-Id: Iae5bad958c6d1a71dac1672f5f03991eaeea6d22 +--- + llvm/lib/Object/RelocationResolver.cpp | 12 +- + .../MCTargetDesc/LoongArchAsmBackend.cpp | 129 ++++++++++++++++++ + .../MCTargetDesc/LoongArchAsmBackend.h | 5 + + .../LoongArch/dwarf-loongarch-relocs.ll | 128 +++++++++++++++++ + llvm/test/DebugInfo/LoongArch/lit.local.cfg | 2 + + 5 files changed, 274 insertions(+), 2 deletions(-) + create mode 100644 llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll + create mode 100644 llvm/test/DebugInfo/LoongArch/lit.local.cfg + +diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp +index 03ac59289528..0e5036d7dfcc 100644 +--- a/llvm/lib/Object/RelocationResolver.cpp ++++ b/llvm/lib/Object/RelocationResolver.cpp +@@ -539,6 +539,8 @@ static bool supportsLoongArch(uint64_t Type) { + case ELF::R_LARCH_32: + case ELF::R_LARCH_32_PCREL: + case ELF::R_LARCH_64: ++ case ELF::R_LARCH_ADD6: ++ case ELF::R_LARCH_SUB6: + case ELF::R_LARCH_ADD8: + case ELF::R_LARCH_SUB8: + case ELF::R_LARCH_ADD16: +@@ -564,6 +566,10 @@ static uint64_t resolveLoongArch(uint64_t Type, uint64_t Offset, uint64_t S, + return (S + Addend - Offset) & 0xFFFFFFFF; + case ELF::R_LARCH_64: + return S + Addend; ++ case ELF::R_LARCH_ADD6: ++ return (LocData & 0xC0) | ((LocData + S + Addend) & 0x3F); ++ case ELF::R_LARCH_SUB6: ++ return (LocData & 0xC0) | ((LocData - (S + Addend)) & 0x3F); + case ELF::R_LARCH_ADD8: + return (LocData + (S + Addend)) & 0xFF; + case ELF::R_LARCH_SUB8: +@@ -880,8 +886,10 @@ uint64_t resolveRelocation(RelocationResolver Resolver, const RelocationRef &R, + + if (GetRelSectionType() == ELF::SHT_RELA) { + Addend = getELFAddend(R); +- // RISCV relocations use both LocData and Addend. +- if (Obj->getArch() != Triple::riscv32 && ++ // LoongArch and RISCV relocations use both LocData and Addend. 
++ if (Obj->getArch() != Triple::loongarch32 && ++ Obj->getArch() != Triple::loongarch64 && ++ Obj->getArch() != Triple::riscv32 && + Obj->getArch() != Triple::riscv64) + LocData = 0; + } +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +index 9227d4d6afed..8d82327b2e2b 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +@@ -12,6 +12,7 @@ + + #include "LoongArchAsmBackend.h" + #include "LoongArchFixupKinds.h" ++#include "llvm/MC/MCAsmInfo.h" + #include "llvm/MC/MCAsmLayout.h" + #include "llvm/MC/MCAssembler.h" + #include "llvm/MC/MCContext.h" +@@ -19,6 +20,7 @@ + #include "llvm/MC/MCValue.h" + #include "llvm/Support/Endian.h" + #include "llvm/Support/EndianStream.h" ++#include "llvm/Support/LEB128.h" + + #define DEBUG_TYPE "loongarch-asmbackend" + +@@ -235,6 +237,133 @@ std::pair LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF, + return std::make_pair(true, true); + } + ++bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, ++ MCAsmLayout &Layout, ++ bool &WasRelaxed) const { ++ MCContext &C = Layout.getAssembler().getContext(); ++ ++ int64_t LineDelta = DF.getLineDelta(); ++ const MCExpr &AddrDelta = DF.getAddrDelta(); ++ SmallVectorImpl &Data = DF.getContents(); ++ SmallVectorImpl &Fixups = DF.getFixups(); ++ size_t OldSize = Data.size(); ++ ++ int64_t Value; ++ if (AddrDelta.evaluateAsAbsolute(Value, Layout)) ++ return false; ++ bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, Layout); ++ assert(IsAbsolute && "CFA with invalid expression"); ++ (void)IsAbsolute; ++ ++ Data.clear(); ++ Fixups.clear(); ++ raw_svector_ostream OS(Data); ++ ++ // INT64_MAX is a signal that this is actually a DW_LNE_end_sequence. 
++ if (LineDelta != INT64_MAX) { ++ OS << uint8_t(dwarf::DW_LNS_advance_line); ++ encodeSLEB128(LineDelta, OS); ++ } ++ ++ unsigned Offset; ++ std::pair FK; ++ ++ // According to the DWARF specification, the `DW_LNS_fixed_advance_pc` opcode ++ // takes a single unsigned half (unencoded) operand. The maximum encodable ++ // value is therefore 65535. Set a conservative upper bound for relaxation. ++ if (Value > 60000) { ++ unsigned PtrSize = C.getAsmInfo()->getCodePointerSize(); ++ ++ OS << uint8_t(dwarf::DW_LNS_extended_op); ++ encodeULEB128(PtrSize + 1, OS); ++ ++ OS << uint8_t(dwarf::DW_LNE_set_address); ++ Offset = OS.tell(); ++ assert((PtrSize == 4 || PtrSize == 8) && "Unexpected pointer size"); ++ FK = getRelocPairForSize(PtrSize == 4 ? 32 : 64); ++ OS.write_zeros(PtrSize); ++ } else { ++ OS << uint8_t(dwarf::DW_LNS_fixed_advance_pc); ++ Offset = OS.tell(); ++ FK = getRelocPairForSize(16); ++ support::endian::write(OS, 0, support::little); ++ } ++ ++ const MCBinaryExpr &MBE = cast(AddrDelta); ++ Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK))); ++ Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK))); ++ ++ if (LineDelta == INT64_MAX) { ++ OS << uint8_t(dwarf::DW_LNS_extended_op); ++ OS << uint8_t(1); ++ OS << uint8_t(dwarf::DW_LNE_end_sequence); ++ } else { ++ OS << uint8_t(dwarf::DW_LNS_copy); ++ } ++ ++ WasRelaxed = OldSize != Data.size(); ++ return true; ++} ++ ++bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, ++ MCAsmLayout &Layout, ++ bool &WasRelaxed) const { ++ const MCExpr &AddrDelta = DF.getAddrDelta(); ++ SmallVectorImpl &Data = DF.getContents(); ++ SmallVectorImpl &Fixups = DF.getFixups(); ++ size_t OldSize = Data.size(); ++ ++ int64_t Value; ++ if (AddrDelta.evaluateAsAbsolute(Value, Layout)) ++ return false; ++ bool IsAbsolute = AddrDelta.evaluateKnownAbsolute(Value, Layout); ++ assert(IsAbsolute && "CFA with invalid expression"); ++ (void)IsAbsolute; ++ ++ Data.clear(); ++ 
Fixups.clear(); ++ raw_svector_ostream OS(Data); ++ ++ assert( ++ Layout.getAssembler().getContext().getAsmInfo()->getMinInstAlignment() == ++ 1 && ++ "expected 1-byte alignment"); ++ if (Value == 0) { ++ WasRelaxed = OldSize != Data.size(); ++ return true; ++ } ++ ++ auto AddFixups = [&Fixups, ++ &AddrDelta](unsigned Offset, ++ std::pair FK) { ++ const MCBinaryExpr &MBE = cast(AddrDelta); ++ Fixups.push_back(MCFixup::create(Offset, MBE.getLHS(), std::get<0>(FK))); ++ Fixups.push_back(MCFixup::create(Offset, MBE.getRHS(), std::get<1>(FK))); ++ }; ++ ++ if (isUIntN(6, Value)) { ++ OS << uint8_t(dwarf::DW_CFA_advance_loc); ++ AddFixups(0, getRelocPairForSize(6)); ++ } else if (isUInt<8>(Value)) { ++ OS << uint8_t(dwarf::DW_CFA_advance_loc1); ++ support::endian::write(OS, 0, support::little); ++ AddFixups(1, getRelocPairForSize(8)); ++ } else if (isUInt<16>(Value)) { ++ OS << uint8_t(dwarf::DW_CFA_advance_loc2); ++ support::endian::write(OS, 0, support::little); ++ AddFixups(1, getRelocPairForSize(16)); ++ } else if (isUInt<32>(Value)) { ++ OS << uint8_t(dwarf::DW_CFA_advance_loc4); ++ support::endian::write(OS, 0, support::little); ++ AddFixups(1, getRelocPairForSize(32)); ++ } else { ++ llvm_unreachable("unsupported CFA encoding"); ++ } ++ ++ WasRelaxed = OldSize != Data.size(); ++ return true; ++} ++ + bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const { + // We mostly follow binutils' convention here: align to 4-byte boundary with a +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +index 49801e4fd81a..657f5ca5e731 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +@@ -68,6 +68,11 @@ public: + std::pair relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, + int64_t &Value) const override; + ++ bool 
relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, MCAsmLayout &Layout, ++ bool &WasRelaxed) const override; ++ bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout, ++ bool &WasRelaxed) const override; ++ + bool writeNopData(raw_ostream &OS, uint64_t Count, + const MCSubtargetInfo *STI) const override; + +diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +new file mode 100644 +index 000000000000..e03b4c1d34de +--- /dev/null ++++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +@@ -0,0 +1,128 @@ ++; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o ++; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s ++; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s ++; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s ++ ++; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax %s -o %t.r.o ++; RUN: llvm-readobj -r %t.r.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-ENRL %s ++; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefix=SOURCE %s ++; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefix=DWARF %s ++ ++; RELOCS-BOTH: Relocations [ ++; RELOCS-BOTH-NEXT: Section ({{.*}}) .rela.text { ++; RELOCS-BOTH-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 ++; RELOCS-ENRL-NEXT: 0x14 R_LARCH_RELAX - 0x0 ++; RELOCS-BOTH-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 ++; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0 ++; RELOCS-BOTH-NEXT: } ++; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame { ++; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0 ++; RELOCS-NORL-NEXT: 0x20 R_LARCH_64 .text 0x0 ++; RELOCS-ENRL-NEXT: 0x1C R_LARCH_32 0x0 ++; RELOCS-ENRL-NEXT: 0x20 R_LARCH_64 0x0 ++; RELOCS-ENRL-NEXT: 0x28 R_LARCH_ADD64 0x0 ++; RELOCS-ENRL-NEXT: 0x28 R_LARCH_SUB64 0x0 ++; RELOCS-ENRL-NEXT: 0x3F R_LARCH_ADD6 0x0 ++; RELOCS-ENRL-NEXT: 0x3F R_LARCH_SUB6 0x0 ++; 
RELOCS-BOTH-NEXT: } ++; RELOCS-BOTH: Section ({{.*}}) .rela.debug_line { ++; RELOCS-BOTH-NEXT: 0x22 R_LARCH_32 .debug_line_str 0x0 ++; RELOCS-BOTH-NEXT: 0x31 R_LARCH_32 .debug_line_str 0x2 ++; RELOCS-BOTH-NEXT: 0x46 R_LARCH_32 .debug_line_str 0x1B ++; RELOCS-NORL-NEXT: 0x4F R_LARCH_64 .text 0x0 ++; RELOCS-ENRL-NEXT: 0x4F R_LARCH_64 0x0 ++; RELOCS-ENRL-NEXT: 0x5F R_LARCH_ADD16 0x0 ++; RELOCS-ENRL-NEXT: 0x5F R_LARCH_SUB16 0x0 ++; RELOCS-BOTH-NEXT: } ++; RELOCS-BOTH-NEXT: ] ++ ++; SOURCE: 0000000000000000 : ++; SOURCE: ; { ++; SOURCE: ; asm volatile( ++; SOURCE: ; return 0; ++ ++; DWARF: DW_AT_producer ("clang") ++; DWARF: DW_AT_name ("dwarf-loongarch-relocs.c") ++; DWARF: DW_AT_comp_dir (".") ++; DWARF: DW_AT_name ("foo") ++; DWARF-NEXT: DW_AT_decl_file ("{{.*}}dwarf-loongarch-relocs.c") ++; DWARF-NEXT: DW_AT_decl_line (1) ++; DWARF-NEXT: DW_AT_type (0x00000032 "int") ++; DWARF: DW_AT_name ("int") ++; DWARF-NEXT: DW_AT_encoding (DW_ATE_signed) ++; DWARF-NEXT: DW_AT_byte_size (0x04) ++; DWARF: .debug_line contents: ++; DWARF-NEXT: debug_line[0x00000000] ++; DWARF-NEXT: Line table prologue: ++; DWARF-NEXT: total_length: {{.*}} ++; DWARF-NEXT: format: DWARF32 ++; DWARF-NEXT: version: 5 ++; DWARF-NEXT: address_size: 8 ++; DWARF-NEXT: seg_select_size: 0 ++; DWARF-NEXT: prologue_length: 0x0000003e ++; DWARF-NEXT: min_inst_length: 1 ++; DWARF-NEXT: max_ops_per_inst: 1 ++; DWARF-NEXT: default_is_stmt: 1 ++; DWARF-NEXT: line_base: -5 ++; DWARF-NEXT: line_range: 14 ++; DWARF-NEXT: opcode_base: 13 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_copy] = 0 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_advance_pc] = 1 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_advance_line] = 1 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_set_file] = 1 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_set_column] = 1 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_negate_stmt] = 0 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_set_basic_block] = 0 ++; DWARF-NEXT: 
standard_opcode_lengths[DW_LNS_const_add_pc] = 0 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_fixed_advance_pc] = 1 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_set_prologue_end] = 0 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_set_epilogue_begin] = 0 ++; DWARF-NEXT: standard_opcode_lengths[DW_LNS_set_isa] = 1 ++; DWARF-NEXT: include_directories[ 0] = "." ++; DWARF-NEXT: file_names[ 0]: ++; DWARF-NEXT: name: "dwarf-loongarch-relocs.c" ++; DWARF-NEXT: dir_index: 0 ++; DWARF-NEXT: md5_checksum: f44d6d71bc4da58b4abe338ca507c007 ++; DWARF-NEXT: source: "{{.*}}" ++; DWARF-EMPTY: ++; DWARF-NEXT: Address Line Column File ISA Discriminator OpIndex Flags ++; DWARF-NEXT: ------------------ ------ ------ ------ --- ------------- ------- ------------- ++; DWARF-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt ++; DWARF-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end ++; DWARF-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt ++; DWARF-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin ++; DWARF-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence ++ ++; ModuleID = 'dwarf-loongarch-relocs.c' ++source_filename = "dwarf-loongarch-relocs.c" ++target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" ++target triple = "loongarch64" ++ ++; Function Attrs: noinline nounwind optnone ++define dso_local signext i32 @foo() #0 !dbg !8 { ++ call void asm sideeffect ".cfi_remember_state\0A\09.cfi_adjust_cfa_offset 16\0A\09nop\0A\09la.pcrel $$t0, sym\0A\09nop\0A\09.cfi_restore_state\0A\09", ""() #1, !dbg !12, !srcloc !13 ++ ret i32 0, !dbg !14 ++} ++ ++attributes #0 = { noinline nounwind optnone "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="loongarch64" "target-features"="+64bit,+d,+f,+ual" } ++attributes #1 = { nounwind } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!2, !3, !4, !5, !6} ++!llvm.ident = !{!7} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, 
runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "dwarf-loongarch-relocs.c", directory: ".", checksumkind: CSK_MD5, checksum: "f44d6d71bc4da58b4abe338ca507c007", source: "int foo()\0A{\0A asm volatile(\0A \22.cfi_remember_state\\n\\t\22\0A \22.cfi_adjust_cfa_offset 16\\n\\t\22\0A \22nop\\n\\t\22\0A \22la.pcrel $t0, sym\\n\\t\22\0A \22nop\\n\\t\22\0A \22.cfi_restore_state\\n\\t\22);\0A return 0;\0A}\0A") ++!2 = !{i32 7, !"Dwarf Version", i32 5} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 7, !"direct-access-external-data", i32 0} ++!6 = !{i32 7, !"frame-pointer", i32 2} ++!7 = !{!"clang"} ++!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !9, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0) ++!9 = !DISubroutineType(types: !10) ++!10 = !{!11} ++!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ++!12 = !DILocation(line: 3, column: 3, scope: !8) ++!13 = !{i64 34, i64 56, i64 92, i64 106, i64 134, i64 148, i64 177} ++!14 = !DILocation(line: 10, column: 3, scope: !8) +diff --git a/llvm/test/DebugInfo/LoongArch/lit.local.cfg b/llvm/test/DebugInfo/LoongArch/lit.local.cfg +new file mode 100644 +index 000000000000..77becb8eee90 +--- /dev/null ++++ b/llvm/test/DebugInfo/LoongArch/lit.local.cfg +@@ -0,0 +1,2 @@ ++if "LoongArch" not in config.root.targets: ++ config.unsupported = True +-- +2.20.1 + diff --git a/0007-Backport-LoongArch-Insert-nops-and-emit-align-reloc-when-handle-alignment-directive.patch b/0007-Backport-LoongArch-Insert-nops-and-emit-align-reloc-when-handle-alignment-directive.patch new file mode 100644 index 0000000..9d027af --- /dev/null +++ b/0007-Backport-LoongArch-Insert-nops-and-emit-align-reloc-when-handle-alignment-directive.patch @@ -0,0 +1,362 @@ +From 87f6adc2acf635a0a4c294217fb54c55eee3a06c Mon Sep 17 00:00:00 2001 +From: Jinyang He +Date: Wed, 24 Jan 2024 09:17:49 +0800 +Subject: 
[PATCH 08/14] [LoongArch] Insert nops and emit align reloc when + handle alignment directive (#72962) + +Refer to RISCV, we will fix up the alignment if linker relaxation +changes code size and breaks alignment. Insert enough Nops and emit +R_LARCH_ALIGN relocation type so that linker could satisfy the alignment +by removing Nops. +It does so only in sections with the SHF_EXECINSTR flag. + +In LoongArch psABI v2.30, R_LARCH_ALIGN requires symbol index. The +lowest 8 bits of addend represent alignment and the other bits of addend +represent the maximum number of bytes to emit. + +(cherry picked from commit c51ab483e6c2d991a01179584705b83fbea1940d) +Change-Id: Iba30702c9dda378acfae0b1f1134926fa838a368 +--- + llvm/lib/MC/MCExpr.cpp | 2 +- + .../MCTargetDesc/LoongArchAsmBackend.cpp | 67 ++++++++++++++++ + .../MCTargetDesc/LoongArchAsmBackend.h | 15 ++++ + .../MCTargetDesc/LoongArchFixupKinds.h | 4 +- + .../Relocations/align-non-executable.s | 27 +++++++ + .../MC/LoongArch/Relocations/relax-addsub.s | 15 +++- + .../MC/LoongArch/Relocations/relax-align.s | 79 +++++++++++++++++++ + 7 files changed, 205 insertions(+), 4 deletions(-) + create mode 100644 llvm/test/MC/LoongArch/Relocations/align-non-executable.s + create mode 100644 llvm/test/MC/LoongArch/Relocations/relax-align.s + +diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp +index a561fed11179..79808a58d81c 100644 +--- a/llvm/lib/MC/MCExpr.cpp ++++ b/llvm/lib/MC/MCExpr.cpp +@@ -711,7 +711,7 @@ static void AttemptToFoldSymbolOffsetDifference( + if (DF) { + Displacement += DF->getContents().size(); + } else if (auto *AF = dyn_cast(FI); +- AF && Layout && ++ AF && Layout && AF->hasEmitNops() && + !Asm->getBackend().shouldInsertExtraNopBytesForCodeAlign( + *AF, Count)) { + Displacement += Asm->computeFragmentSize(*Layout, *AF); +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +index 8d82327b2e2b..8c482356402f 100644 
+--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +@@ -17,10 +17,13 @@ + #include "llvm/MC/MCAssembler.h" + #include "llvm/MC/MCContext.h" + #include "llvm/MC/MCELFObjectWriter.h" ++#include "llvm/MC/MCExpr.h" ++#include "llvm/MC/MCSection.h" + #include "llvm/MC/MCValue.h" + #include "llvm/Support/Endian.h" + #include "llvm/Support/EndianStream.h" + #include "llvm/Support/LEB128.h" ++#include "llvm/Support/MathExtras.h" + + #define DEBUG_TYPE "loongarch-asmbackend" + +@@ -177,6 +180,70 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, + } + } + ++// Linker relaxation may change code size. We have to insert Nops ++// for .align directive when linker relaxation enabled. So then Linker ++// could satisfy alignment by removing Nops. ++// The function returns the total Nops Size we need to insert. ++bool LoongArchAsmBackend::shouldInsertExtraNopBytesForCodeAlign( ++ const MCAlignFragment &AF, unsigned &Size) { ++ // Calculate Nops Size only when linker relaxation enabled. ++ if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) ++ return false; ++ ++ // Ignore alignment if MaxBytesToEmit is less than the minimum Nop size. ++ const unsigned MinNopLen = 4; ++ if (AF.getMaxBytesToEmit() < MinNopLen) ++ return false; ++ Size = AF.getAlignment().value() - MinNopLen; ++ return AF.getAlignment() > MinNopLen; ++} ++ ++// We need to insert R_LARCH_ALIGN relocation type to indicate the ++// position of Nops and the total bytes of the Nops have been inserted ++// when linker relaxation enabled. ++// The function inserts fixup_loongarch_align fixup which eventually will ++// transfer to R_LARCH_ALIGN relocation type. ++// The improved R_LARCH_ALIGN requires symbol index. The lowest 8 bits of ++// addend represent alignment and the other bits of addend represent the ++// maximum number of bytes to emit. 
The maximum number of bytes is zero ++// means ignore the emit limit. ++bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign( ++ MCAssembler &Asm, const MCAsmLayout &Layout, MCAlignFragment &AF) { ++ // Insert the fixup only when linker relaxation enabled. ++ if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) ++ return false; ++ ++ // Calculate total Nops we need to insert. If there are none to insert ++ // then simply return. ++ unsigned Count; ++ if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count)) ++ return false; ++ ++ MCSection *Sec = AF.getParent(); ++ MCContext &Ctx = Asm.getContext(); ++ const MCExpr *Dummy = MCConstantExpr::create(0, Ctx); ++ // Create fixup_loongarch_align fixup. ++ MCFixup Fixup = ++ MCFixup::create(0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_align)); ++ const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec]; ++ if (MCSym == nullptr) { ++ // Create a symbol and make the value of symbol is zero. ++ MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); ++ Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); ++ Asm.registerSymbol(*Sym); ++ MCSym = MCSymbolRefExpr::create(Sym, Ctx); ++ getSecToAlignSym()[Sec] = MCSym; ++ } ++ ++ uint64_t FixedValue = 0; ++ unsigned Lo = Log2_64(Count) + 1; ++ unsigned Hi = AF.getMaxBytesToEmit() >= Count ? 
0 : AF.getMaxBytesToEmit(); ++ MCValue Value = MCValue::get(MCSym, nullptr, Hi << 8 | Lo); ++ Asm.getWriter().recordRelocation(Asm, Layout, &AF, Fixup, Value, FixedValue); ++ ++ return true; ++} ++ + bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, + const MCFixup &Fixup, + const MCValue &Target) { +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +index 657f5ca5e731..71bbd003888a 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +@@ -17,7 +17,9 @@ + #include "MCTargetDesc/LoongArchFixupKinds.h" + #include "MCTargetDesc/LoongArchMCTargetDesc.h" + #include "llvm/MC/MCAsmBackend.h" ++#include "llvm/MC/MCExpr.h" + #include "llvm/MC/MCFixupKindInfo.h" ++#include "llvm/MC/MCSection.h" + #include "llvm/MC/MCSubtargetInfo.h" + + namespace llvm { +@@ -27,6 +29,7 @@ class LoongArchAsmBackend : public MCAsmBackend { + uint8_t OSABI; + bool Is64Bit; + const MCTargetOptions &TargetOptions; ++ DenseMap SecToAlignSym; + + public: + LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, +@@ -45,6 +48,15 @@ public: + uint64_t Value, bool IsResolved, + const MCSubtargetInfo *STI) const override; + ++ // Return Size with extra Nop Bytes for alignment directive in code section. ++ bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF, ++ unsigned &Size) override; ++ ++ // Insert target specific fixup type for alignment directive in code section. 
++ bool shouldInsertFixupForCodeAlign(MCAssembler &Asm, ++ const MCAsmLayout &Layout, ++ MCAlignFragment &AF) override; ++ + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, + const MCValue &Target) override; + +@@ -79,6 +91,9 @@ public: + std::unique_ptr + createObjectTargetWriter() const override; + const MCTargetOptions &getTargetOptions() const { return TargetOptions; } ++ DenseMap &getSecToAlignSym() { ++ return SecToAlignSym; ++ } + }; + } // end namespace llvm + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +index 178fa6e5262b..78414408f21f 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +@@ -108,7 +108,9 @@ enum Fixups { + // 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w. + fixup_loongarch_tls_gd_hi20, + // Generate an R_LARCH_RELAX which indicates the linker may relax here. +- fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX ++ fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, ++ // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here. ++ fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN, + }; + } // end namespace LoongArch + } // end namespace llvm +diff --git a/llvm/test/MC/LoongArch/Relocations/align-non-executable.s b/llvm/test/MC/LoongArch/Relocations/align-non-executable.s +new file mode 100644 +index 000000000000..47834acd9521 +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Relocations/align-non-executable.s +@@ -0,0 +1,27 @@ ++## A label difference separated by an alignment directive, when the ++## referenced symbols are in a non-executable section with instructions, ++## should generate ADD/SUB relocations. 
++## https://github.com/llvm/llvm-project/pull/76552 ++ ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \ ++# RUN: | llvm-readobj -r - | FileCheck --check-prefixes=CHECK,RELAX %s ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \ ++# RUN: | llvm-readobj -r - | FileCheck %s ++ ++.section ".dummy", "a" ++.L1: ++ la.pcrel $t0, sym ++.p2align 3 ++.L2: ++.dword .L2 - .L1 ++ ++# CHECK: Relocations [ ++# CHECK-NEXT: Section ({{.*}}) .rela.dummy { ++# CHECK-NEXT: 0x0 R_LARCH_PCALA_HI20 sym 0x0 ++# RELAX-NEXT: 0x0 R_LARCH_RELAX - 0x0 ++# CHECK-NEXT: 0x4 R_LARCH_PCALA_LO12 sym 0x0 ++# RELAX-NEXT: 0x4 R_LARCH_RELAX - 0x0 ++# RELAX-NEXT: 0x8 R_LARCH_ADD64 .L2 0x0 ++# RELAX-NEXT: 0x8 R_LARCH_SUB64 .L1 0x0 ++# CHECK-NEXT: } ++# CHECK-NEXT: ] +diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +index cd01332afd0b..18e0ede5e293 100644 +--- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s ++++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +@@ -28,12 +28,23 @@ + + # RELAX: Relocations [ + # RELAX-NEXT: Section ({{.*}}) .rela.text { ++# RELAX-NEXT: 0x4 R_LARCH_ALIGN {{.*}} 0x4 + # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 + # RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 + # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 + # RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 + # RELAX-NEXT: } + # RELAX-NEXT: Section ({{.*}}) .rela.data { ++# RELAX-NEXT: 0x10 R_LARCH_ADD8 .L3 0x0 ++# RELAX-NEXT: 0x10 R_LARCH_SUB8 .L2 0x0 ++# RELAX-NEXT: 0x11 R_LARCH_ADD16 .L3 0x0 ++# RELAX-NEXT: 0x11 R_LARCH_SUB16 .L2 0x0 ++# RELAX-NEXT: 0x13 R_LARCH_ADD32 .L3 0x0 ++# RELAX-NEXT: 0x13 R_LARCH_SUB32 .L2 0x0 ++# RELAX-NEXT: 0x17 R_LARCH_ADD64 .L3 0x0 ++# RELAX-NEXT: 0x17 R_LARCH_SUB64 .L2 0x0 ++# RELAX-NEXT: 0x1F R_LARCH_ADD_ULEB128 .L3 0x0 ++# RELAX-NEXT: 0x1F R_LARCH_SUB_ULEB128 .L2 0x0 + # RELAX-NEXT: 0x20 R_LARCH_ADD8 .L4 0x0 + # RELAX-NEXT: 0x20 R_LARCH_SUB8 .L3 0x0 + # RELAX-NEXT: 0x21 R_LARCH_ADD16 .L4 
0x0 +@@ -57,7 +68,7 @@ + + # RELAX: Hex dump of section '.data': + # RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000004 +-# RELAX-NEXT: 0x00000010 0c0c000c 0000000c 00000000 0000000c ++# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000 + # RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00000000 + # RELAX-NEXT: 0x00000030 00000000 00000000 00000000 000000 + +@@ -78,7 +89,7 @@ + .word .L2 - .L1 + .dword .L2 - .L1 + .uleb128 .L2 - .L1 +-## TODO Handle alignment directive. ++## With relaxation, emit relocs because the .align makes the diff variable. + .byte .L3 - .L2 + .short .L3 - .L2 + .word .L3 - .L2 +diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align.s b/llvm/test/MC/LoongArch/Relocations/relax-align.s +new file mode 100644 +index 000000000000..294fd9fb916c +--- /dev/null ++++ b/llvm/test/MC/LoongArch/Relocations/relax-align.s +@@ -0,0 +1,79 @@ ++## The file testing Nop insertion with R_LARCH_ALIGN for relaxation. ++ ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t ++# RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=INSTR ++# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC ++# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.r ++# RUN: llvm-objdump -d %t.r | FileCheck %s --check-prefixes=INSTR,RELAX-INSTR ++# RUN: llvm-readobj -r %t.r | FileCheck %s --check-prefixes=RELOC,RELAX-RELOC ++ ++.text ++break 0 ++# INSTR: break 0 ++ ++## Not emit R_LARCH_ALIGN if alignment directive is less than or equal to ++## minimum code alignment(a.k.a 4). ++.p2align 2 ++.p2align 1 ++.p2align 0 ++ ++## Not emit instructions if max emit bytes less than min nop size. ++.p2align 4, , 2 ++ ++## Not emit R_LARCH_ALIGN if alignment directive with specific padding value. ++## The behavior is the same as GNU assembler. 
++break 1 ++.p2align 4, 1 ++# INSTR-NEXT: break 1 ++# INSTR-COUNT-2: 01 01 01 01 ++ ++break 2 ++.p2align 4, 1, 12 ++# INSTR-NEXT: break 2 ++# INSTR-COUNT-3: 01 01 01 01 ++ ++break 3 ++.p2align 4 ++# INSTR-NEXT: break 3 ++# INSTR-COUNT-3: nop ++ ++break 4 ++.p2align 5 ++.p2align 4 ++# INSTR-NEXT: break 4 ++# INSTR-COUNT-3: nop ++# RELAX-INSTR-COUNT-7: nop ++ ++break 5 ++.p2align 4, , 11 ++# INSTR-NEXT: break 5 ++# RELAX-INSTR-COUNT-3: nop ++ ++break 6 ++## Not emit the third parameter. ++.p2align 4, , 12 ++# INSTR-NEXT: break 6 ++# INSTR-NEXT: nop ++# INSTR-NEXT: nop ++# RELAX-INSTR-NEXT: nop ++ ++ret ++# INSNR-NEXT: ret ++ ++## Test the symbol index is different from .text. ++.section .text2, "ax" ++.p2align 4 ++break 7 ++ ++# RELOC: Relocations [ ++# RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text { ++# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .Lla-relax-align0 0x4 ++# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .Lla-relax-align0 0x5 ++# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .Lla-relax-align0 0x4 ++# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .Lla-relax-align0 0xB04 ++# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .Lla-relax-align0 0x4 ++# RELAX-RELOC-NEXT: } ++# RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text2 { ++# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align1 0x4 ++# RELAX-RELOC-NEXT: } ++# RELOC-NEXT: ] +-- +2.20.1 + diff --git a/0008-Backport-test-Update-dwarf-loongarch-relocs.ll.patch b/0008-Backport-test-Update-dwarf-loongarch-relocs.ll.patch new file mode 100644 index 0000000..4ed67f0 --- /dev/null +++ b/0008-Backport-test-Update-dwarf-loongarch-relocs.ll.patch @@ -0,0 +1,86 @@ +From f51ee6c3468eacc82d3b3f09fcca381178bdc9e7 Mon Sep 17 00:00:00 2001 +From: Weining Lu +Date: Wed, 24 Jan 2024 11:03:14 +0800 +Subject: [PATCH 11/14] [test] Update dwarf-loongarch-relocs.ll + +Address buildbot faiures: +http://45.33.8.238/macm1/77360/step_11.txt +http://45.33.8.238/linux/128902/step_12.txt + +(cherry picked from commit baba7e4175b6ca21e83b1cf8229f29dbba02e979) +(cherry picked from 
commit c9e73cdd9a17f15ede120ea57657553f9e105eab) +Change-Id: I00aa1414f556f0ba5ff6bf6a879a6fc1fcfa49e0 +--- + .../LoongArch/dwarf-loongarch-relocs.ll | 37 ++++++++++++------- + 1 file changed, 23 insertions(+), 14 deletions(-) + +diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +index e03b4c1d34de..07443a62b933 100644 +--- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll ++++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +@@ -1,19 +1,22 @@ + ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o + ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s +-; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s +-; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s ++; RUN: llvm-objdump --source %t.o | FileCheck --check-prefixes=SOURCE,SOURCE-NORL %s ++; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefixes=DWARF,DWARF-NORL %s + + ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax %s -o %t.r.o + ; RUN: llvm-readobj -r %t.r.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-ENRL %s +-; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefix=SOURCE %s +-; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefix=DWARF %s ++; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefixes=SOURCE,SOURCE-ENRL %s ++; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefixes=DWARF,DWARF-ENRL %s + + ; RELOCS-BOTH: Relocations [ + ; RELOCS-BOTH-NEXT: Section ({{.*}}) .rela.text { +-; RELOCS-BOTH-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 +-; RELOCS-ENRL-NEXT: 0x14 R_LARCH_RELAX - 0x0 +-; RELOCS-BOTH-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 +-; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0 ++; RELOCS-NORL-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 ++; RELOCS-NORL-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 ++; 
RELOCS-ENRL-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align0 0x5 ++; RELOCS-ENRL-NEXT: 0x30 R_LARCH_PCALA_HI20 sym 0x0 ++; RELOCS-ENRL-NEXT: 0x30 R_LARCH_RELAX - 0x0 ++; RELOCS-ENRL-NEXT: 0x34 R_LARCH_PCALA_LO12 sym 0x0 ++; RELOCS-ENRL-NEXT: 0x34 R_LARCH_RELAX - 0x0 + ; RELOCS-BOTH-NEXT: } + ; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame { + ; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0 +@@ -36,7 +39,8 @@ + ; RELOCS-BOTH-NEXT: } + ; RELOCS-BOTH-NEXT: ] + +-; SOURCE: 0000000000000000 : ++; SOURCE-NORL: 0000000000000000 : ++; SOURCE-ENRL: 000000000000001c : + ; SOURCE: ; { + ; SOURCE: ; asm volatile( + ; SOURCE: ; return 0; +@@ -87,11 +91,16 @@ + ; DWARF-EMPTY: + ; DWARF-NEXT: Address Line Column File ISA Discriminator OpIndex Flags + ; DWARF-NEXT: ------------------ ------ ------ ------ --- ------------- ------- ------------- +-; DWARF-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt +-; DWARF-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end +-; DWARF-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt +-; DWARF-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin +-; DWARF-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence ++; DWARF-NORL-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt ++; DWARF-NORL-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end ++; DWARF-NORL-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt ++; DWARF-NORL-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin ++; DWARF-NORL-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence ++; DWARF-ENRL-NEXT: 0x000000000000001c 2 0 0 0 0 0 is_stmt ++; DWARF-ENRL-NEXT: 0x000000000000002c 3 3 0 0 0 0 is_stmt prologue_end ++; DWARF-ENRL-NEXT: 0x000000000000003c 10 3 0 0 0 0 is_stmt ++; DWARF-ENRL-NEXT: 0x0000000000000048 10 3 0 0 0 0 epilogue_begin ++; DWARF-ENRL-NEXT: 0x0000000000000050 10 3 0 0 0 0 end_sequence + + ; ModuleID = 'dwarf-loongarch-relocs.c' + source_filename = "dwarf-loongarch-relocs.c" +-- +2.20.1 + diff --git 
a/0009-Backport-MC-test-Change-ELF-uleb-ehtable.s-Mach-O-to-use-private-symbols-in-.uleb128-for-label-differences.patch b/0009-Backport-MC-test-Change-ELF-uleb-ehtable.s-Mach-O-to-use-private-symbols-in-.uleb128-for-label-differences.patch new file mode 100644 index 0000000..94bb772 --- /dev/null +++ b/0009-Backport-MC-test-Change-ELF-uleb-ehtable.s-Mach-O-to-use-private-symbols-in-.uleb128-for-label-differences.patch @@ -0,0 +1,53 @@ +From 442b5109ccbabed1110c122c1ca92d4194ba632b Mon Sep 17 00:00:00 2001 +From: Fangrui Song +Date: Wed, 9 Aug 2023 21:42:18 -0700 +Subject: [PATCH 13/14] [MC][test] Change ELF/uleb-ehtable.s Mach-O to use + private symbols in .uleb128 for label differences + +On Mach-O, `.uleb128 A-B` where A and B are separated by a non-private symbol is invalid +(see D153167). + +(cherry picked from commit 0a89bda4a8b756a00985e0965f7686b5ceb43295) +Change-Id: I92ed11d6913b8c781e29be6e8c642cf0a371910d +--- + llvm/test/MC/ELF/uleb-ehtable.s | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/llvm/test/MC/ELF/uleb-ehtable.s b/llvm/test/MC/ELF/uleb-ehtable.s +index ca3f9e97bffc..6407223f36e7 100644 +--- a/llvm/test/MC/ELF/uleb-ehtable.s ++++ b/llvm/test/MC/ELF/uleb-ehtable.s +@@ -1,7 +1,7 @@ + // RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - | llvm-readobj -S --sd - | FileCheck %s -check-prefix=CHECK -check-prefix=ELF + // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -S --sd - | FileCheck %s -check-prefix=CHECK -check-prefix=ELF +-// RUN: llvm-mc -filetype=obj -triple i386-apple-darwin9 %s -o - | llvm-readobj -S --sd - | FileCheck %s -check-prefix=CHECK -check-prefix=MACHO +-// RUN: llvm-mc -filetype=obj -triple x86_64-apple-darwin9 %s -o - | llvm-readobj -S --sd - | FileCheck %s -check-prefix=CHECK -check-prefix=MACHO ++// RUN: llvm-mc -filetype=obj -triple i386-apple-darwin9 --defsym MACHO=1 %s -o - | llvm-readobj -S --sd - | FileCheck %s -check-prefix=CHECK 
-check-prefix=MACHO ++// RUN: llvm-mc -filetype=obj -triple x86_64-apple-darwin9 --defsym MACHO=1 %s -o - | llvm-readobj -S --sd - | FileCheck %s -check-prefix=CHECK -check-prefix=MACHO + + // Test that we can assemble a GCC-like EH table that has 16381-16383 bytes of + // non-padding data between .ttbaseref and .ttbase. The assembler must insert +@@ -13,11 +13,20 @@ + foo: + .byte 0xff // LPStart omitted + .byte 0x1 // TType encoding (uleb128) ++.ifdef MACHO ++ .uleb128 Lttbase-Lttbaseref ++Lttbaseref: ++.else + .uleb128 .ttbase-.ttbaseref + .ttbaseref: ++.endif + .fill 128*128-1, 1, 0xcd // call site and actions tables + .balign 4 ++.ifdef MACHO ++Lttbase: ++.else + .ttbase: ++.endif + .byte 1, 2, 3, 4 + + // ELF: Name: .data +-- +2.20.1 + diff --git a/0010-Backport-Mips-MC-AttemptToFoldSymbolOffsetDifference-revert-isMicroMips-special-case.patch b/0010-Backport-Mips-MC-AttemptToFoldSymbolOffsetDifference-revert-isMicroMips-special-case.patch new file mode 100644 index 0000000..1d370ee --- /dev/null +++ b/0010-Backport-Mips-MC-AttemptToFoldSymbolOffsetDifference-revert-isMicroMips-special-case.patch @@ -0,0 +1,135 @@ +From 3b777f98a3997f338919af7ff1ef8a6fd07f76a0 Mon Sep 17 00:00:00 2001 +From: Fangrui Song +Date: Wed, 16 Aug 2023 23:11:59 -0700 +Subject: [PATCH 14/14] [Mips][MC] AttemptToFoldSymbolOffsetDifference: revert + isMicroMips special case + +D52985/D57677 added a .gcc_except_table workaround, but the new behavior +doesn't match GNU assembler. +``` +void foo(); +int bar() { + foo(); + try { throw 1; } + catch (int) { return 1; } + return 0; +} + +clang --target=mipsel-linux-gnu -mmicromips -S a.cc +mipsel-linux-gnu-gcc -mmicromips -c a.s -o gnu.o + +.uleb128 ($cst_end0)-($cst_begin0) // bit 0 is not forced to 1 +.uleb128 ($func_begin0)-($func_begin0) // bit 0 is not forced to 1 +``` + +I have inspected `.gcc_except_table` output by `mipsel-linux-gnu-gcc -mmicromips -c a.cc`. +The `.uleb128` values are not forced to set the least significant bit. 
+ +In addition, D57677's adjustment (even->odd) to CodeGen/Mips/micromips-b-range.ll is wrong. +PC-relative `.long func - .` values will differ from GNU assembler as well. + +The original intention of D52985 seems unclear to me. I think whatever +goal it wants to achieve should be moved to an upper layer. + +This isMicroMips special case has caused problems to fix MCAssembler::relaxLEB to use evaluateAsAbsolute instead of evaluateKnownAbsolute, +which is needed to proper support R_RISCV_SET_ULEB128/R_RISCV_SUB_ULEB128. + +Differential Revision: https://reviews.llvm.org/D157655 + +(cherry picked from commit 4c89277095ee7cda3d20e0f5f18b384212069778) +Change-Id: Iedd73e0c61856c30fde442309fc16d4327829f1a +--- + llvm/lib/MC/MCExpr.cpp | 5 ----- + llvm/test/CodeGen/Mips/micromips-b-range.ll | 8 ++++---- + llvm/test/CodeGen/Mips/micromips-gcc-except-table.ll | 2 +- + llvm/test/DebugInfo/Mips/eh_frame.ll | 4 ++-- + 4 files changed, 7 insertions(+), 12 deletions(-) + +diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp +index 79808a58d81c..c9ff1865cf91 100644 +--- a/llvm/lib/MC/MCExpr.cpp ++++ b/llvm/lib/MC/MCExpr.cpp +@@ -611,11 +611,6 @@ static void AttemptToFoldSymbolOffsetDifference( + if (Asm->isThumbFunc(&SA)) + Addend |= 1; + +- // If symbol is labeled as micromips, we set low-bit to ensure +- // correct offset in .gcc_except_table +- if (Asm->getBackend().isMicroMips(&SA)) +- Addend |= 1; +- + // Clear the symbol expr pointers to indicate we have folded these + // operands. 
+ A = B = nullptr; +diff --git a/llvm/test/CodeGen/Mips/micromips-b-range.ll b/llvm/test/CodeGen/Mips/micromips-b-range.ll +index 064afff3da0e..81d1c04208cc 100644 +--- a/llvm/test/CodeGen/Mips/micromips-b-range.ll ++++ b/llvm/test/CodeGen/Mips/micromips-b-range.ll +@@ -13,7 +13,7 @@ + ; CHECK-NEXT: 1e: fb fd 00 00 sw $ra, 0($sp) + ; CHECK-NEXT: 22: 41 a1 00 01 lui $1, 1 + ; CHECK-NEXT: 26: 40 60 00 02 bal 0x2e +-; CHECK-NEXT: 2a: 30 21 04 69 addiu $1, $1, 1129 ++; CHECK-NEXT: 2a: 30 21 04 68 addiu $1, $1, 1128 + ; CHECK-NEXT: 2e: 00 3f 09 50 addu $1, $ra, $1 + ; CHECK-NEXT: 32: ff fd 00 00 lw $ra, 0($sp) + ; CHECK-NEXT: 36: 00 01 0f 3c jr $1 +@@ -27,7 +27,7 @@ + ; CHECK-NEXT: 56: fb fd 00 00 sw $ra, 0($sp) + ; CHECK-NEXT: 5a: 41 a1 00 01 lui $1, 1 + ; CHECK-NEXT: 5e: 40 60 00 02 bal 0x66 +-; CHECK-NEXT: 62: 30 21 04 5d addiu $1, $1, 1117 ++; CHECK-NEXT: 62: 30 21 04 5c addiu $1, $1, 1116 + ; CHECK-NEXT: 66: 00 3f 09 50 addu $1, $ra, $1 + ; CHECK-NEXT: 6a: ff fd 00 00 lw $ra, 0($sp) + ; CHECK-NEXT: 6e: 00 01 0f 3c jr $1 +@@ -39,7 +39,7 @@ + ; CHECK-NEXT: 86: fb fd 00 00 sw $ra, 0($sp) + ; CHECK-NEXT: 8a: 41 a1 00 01 lui $1, 1 + ; CHECK-NEXT: 8e: 40 60 00 02 bal 0x96 +-; CHECK-NEXT: 92: 30 21 04 2d addiu $1, $1, 1069 ++; CHECK-NEXT: 92: 30 21 04 2c addiu $1, $1, 1068 + ; CHECK-NEXT: 96: 00 3f 09 50 addu $1, $ra, $1 + ; CHECK-NEXT: 9a: ff fd 00 00 lw $ra, 0($sp) + ; CHECK-NEXT: 9e: 00 01 0f 3c jr $1 +@@ -51,7 +51,7 @@ + ; CHECK-NEXT: 10476: fb fd 00 00 sw $ra, 0($sp) + ; CHECK-NEXT: 1047a: 41 a1 00 01 lui $1, 1 + ; CHECK-NEXT: 1047e: 40 60 00 02 bal 0x10486 +-; CHECK-NEXT: 10482: 30 21 04 01 addiu $1, $1, 1025 ++; CHECK-NEXT: 10482: 30 21 04 00 addiu $1, $1, 1024 + ; CHECK-NEXT: 10486: 00 3f 09 50 addu $1, $ra, $1 + ; CHECK-NEXT: 1048a: ff fd 00 00 lw $ra, 0($sp) + ; CHECK-NEXT: 1048e: 00 01 0f 3c jr $1 +diff --git a/llvm/test/CodeGen/Mips/micromips-gcc-except-table.ll b/llvm/test/CodeGen/Mips/micromips-gcc-except-table.ll +index 2b63aff01574..20d64fc216b7 100644 
+--- a/llvm/test/CodeGen/Mips/micromips-gcc-except-table.ll ++++ b/llvm/test/CodeGen/Mips/micromips-gcc-except-table.ll +@@ -1,7 +1,7 @@ + ; RUN: llc -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips -O3 -filetype=obj < %s | llvm-objdump -s -j .gcc_except_table - | FileCheck %s + + ; CHECK: Contents of section .gcc_except_table: +-; CHECK-NEXT: 0000 ff9b1501 0c011100 00110e1f 011f1800 ++; CHECK-NEXT: 0000 ff9b1501 0c001000 00100e1e 011e1800 + ; CHECK-NEXT: 0010 00010000 00000000 + + @_ZTIi = external constant ptr +diff --git a/llvm/test/DebugInfo/Mips/eh_frame.ll b/llvm/test/DebugInfo/Mips/eh_frame.ll +index 506e5b87892b..60d4dc76777e 100644 +--- a/llvm/test/DebugInfo/Mips/eh_frame.ll ++++ b/llvm/test/DebugInfo/Mips/eh_frame.ll +@@ -26,9 +26,9 @@ + ; CHECK-READELF-PIC-NEXT: R_MIPS_PC32 + ; CHECK-READELF-NEXT: .gcc_except_table + +-; EXCEPT-TABLE-STATIC: 0000 ff9b1501 0c011500 00150e23 01231e00 ...........#.#.. ++; EXCEPT-TABLE-STATIC: 0000 ff9b1501 0c001400 00140e22 01221e00 ...........".".. + ; EXCEPT-TABLE-STATIC: 0010 00010000 00000000 +-; EXCEPT-TABLE-PIC: 0000 ff9b1501 0c012d00 002d133f 013f2a00 ......-..-.?.?*. ++; EXCEPT-TABLE-PIC: 0000 ff9b1501 0c002c00 002c123e 013e2a00 ......,..,.>.>*. + ; EXCEPT-TABLE-PIC: 0010 00010000 00000000 ........ 
+ + @_ZTIi = external constant ptr +-- +2.20.1 + diff --git a/0011-Backport-LoongArch-Add-the-support-for-vector-in-llvm17.patch b/0011-Backport-LoongArch-Add-the-support-for-vector-in-llvm17.patch new file mode 100644 index 0000000..0976d4e --- /dev/null +++ b/0011-Backport-LoongArch-Add-the-support-for-vector-in-llvm17.patch @@ -0,0 +1,56520 @@ +From 6ff32ae0ca7a400249535b19d9ca489b44deae19 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Wed, 9 Aug 2023 16:01:37 +0800 +Subject: [PATCH 01/35] [Clang][LoongArch] Use the ClangBuiltin class to + automatically generate support for CBE and CFE + +Fixed the type modifier (L->W), removed redundant feature checking code +since the feature has already been checked in `EmitBuiltinExpr`. And +Cleaned up unused diagnostic information. + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D156866 + +(cherry picked from commit ea8d3b1f9f2d7385d97fcd34d14db0eb2cb2795c) +--- + llvm/include/llvm/IR/IntrinsicsLoongArch.td | 141 ++++++++++---------- + llvm/lib/IR/Function.cpp | 1 + + 2 files changed, 72 insertions(+), 70 deletions(-) + +diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +index 5edce3c529e1..4219b2f55346 100644 +--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td ++++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +@@ -51,74 +51,75 @@ defm int_loongarch_masked_cmpxchg : MaskedAtomicRMWFiveOpIntrinsics; + //===----------------------------------------------------------------------===// + // LoongArch BASE + +-def int_loongarch_break : Intrinsic<[], [llvm_i32_ty], [ImmArg>]>; +-def int_loongarch_cacop_d : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], +- [ImmArg>, ImmArg>]>; +-def int_loongarch_cacop_w : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +- [ImmArg>, ImmArg>]>; +-def int_loongarch_dbar : Intrinsic<[], [llvm_i32_ty], [ImmArg>]>; +-def int_loongarch_ibar : Intrinsic<[], [llvm_i32_ty], [ImmArg>]>; +-def 
int_loongarch_movfcsr2gr : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_movgr2fcsr : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_syscall : Intrinsic<[], [llvm_i32_ty], [ImmArg>]>; +- +-def int_loongarch_crc_w_b_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_crc_w_h_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_crc_w_w_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_crc_w_d_w : Intrinsic<[llvm_i32_ty], +- [llvm_i64_ty, llvm_i32_ty]>; +- +-def int_loongarch_crcc_w_b_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_crcc_w_h_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_crcc_w_w_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_crcc_w_d_w : Intrinsic<[llvm_i32_ty], +- [llvm_i64_ty, llvm_i32_ty]>; +- +-def int_loongarch_csrrd_w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_csrrd_d : Intrinsic<[llvm_i64_ty], [llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_csrwr_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_csrwr_d : Intrinsic<[llvm_i64_ty], +- [llvm_i64_ty, llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_csrxchg_w : Intrinsic<[llvm_i32_ty], +- [llvm_i32_ty, llvm_i32_ty, +- llvm_i32_ty], +- [ImmArg>]>; +-def int_loongarch_csrxchg_d : Intrinsic<[llvm_i64_ty], +- [llvm_i64_ty, llvm_i64_ty, +- llvm_i32_ty], +- [ImmArg>]>; +- +-def int_loongarch_iocsrrd_b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty]>; +-def int_loongarch_iocsrrd_h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty]>; +-def int_loongarch_iocsrrd_w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty]>; +-def int_loongarch_iocsrrd_d : Intrinsic<[llvm_i64_ty], [llvm_i32_ty]>; +- +-def int_loongarch_iocsrwr_b : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_iocsrwr_h : Intrinsic<[], 
[llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_iocsrwr_w : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>; +-def int_loongarch_iocsrwr_d : Intrinsic<[], [llvm_i64_ty, llvm_i32_ty]>; +- +-def int_loongarch_cpucfg : Intrinsic<[llvm_i32_ty], [llvm_i32_ty]>; +- +-def int_loongarch_asrtle_d : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty]>; +-def int_loongarch_asrtgt_d : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty]>; +- +-def int_loongarch_lddir_d : Intrinsic<[llvm_i64_ty], +- [llvm_i64_ty, llvm_i64_ty], +- [ImmArg>]>; +-def int_loongarch_ldpte_d : Intrinsic<[], [llvm_i64_ty, llvm_i64_ty], +- [ImmArg>]>; ++class BaseInt ret_types, list param_types, ++ list intr_properties = []> ++ : Intrinsic, ++ ClangBuiltin; ++ ++def int_loongarch_break : BaseInt<[], [llvm_i32_ty], [ImmArg>]>; ++def int_loongarch_cacop_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty], ++ [ImmArg>, ImmArg>]>; ++def int_loongarch_cacop_w : BaseInt<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], ++ [ImmArg>, ImmArg>]>; ++def int_loongarch_dbar : BaseInt<[], [llvm_i32_ty], [ImmArg>]>; ++ ++def int_loongarch_ibar : BaseInt<[], [llvm_i32_ty], [ImmArg>]>; ++def int_loongarch_movfcsr2gr : BaseInt<[llvm_i32_ty], [llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_movgr2fcsr : BaseInt<[], [llvm_i32_ty, llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_syscall : BaseInt<[], [llvm_i32_ty], [ImmArg>]>; ++ ++def int_loongarch_crc_w_b_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_crc_w_h_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_crc_w_w_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_crc_w_d_w : BaseInt<[llvm_i32_ty], ++ [llvm_i64_ty, llvm_i32_ty]>; ++ ++def int_loongarch_crcc_w_b_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_crcc_w_h_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_crcc_w_w_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty]>; ++def 
int_loongarch_crcc_w_d_w : BaseInt<[llvm_i32_ty], ++ [llvm_i64_ty, llvm_i32_ty]>; ++ ++def int_loongarch_csrrd_w : BaseInt<[llvm_i32_ty], [llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_csrrd_d : BaseInt<[llvm_i64_ty], [llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_csrwr_w : BaseInt<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_csrwr_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_csrxchg_w : BaseInt<[llvm_i32_ty], ++ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], ++ [ImmArg>]>; ++def int_loongarch_csrxchg_d : BaseInt<[llvm_i64_ty], ++ [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty], ++ [ImmArg>]>; ++ ++def int_loongarch_iocsrrd_b : BaseInt<[llvm_i32_ty], [llvm_i32_ty]>; ++def int_loongarch_iocsrrd_h : BaseInt<[llvm_i32_ty], [llvm_i32_ty]>; ++def int_loongarch_iocsrrd_w : BaseInt<[llvm_i32_ty], [llvm_i32_ty]>; ++def int_loongarch_iocsrrd_d : BaseInt<[llvm_i64_ty], [llvm_i32_ty]>; ++ ++def int_loongarch_iocsrwr_b : BaseInt<[], [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_iocsrwr_h : BaseInt<[], [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_iocsrwr_w : BaseInt<[], [llvm_i32_ty, llvm_i32_ty]>; ++def int_loongarch_iocsrwr_d : BaseInt<[], [llvm_i64_ty, llvm_i32_ty]>; ++ ++def int_loongarch_cpucfg : BaseInt<[llvm_i32_ty], [llvm_i32_ty]>; ++ ++def int_loongarch_asrtle_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty]>; ++def int_loongarch_asrtgt_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty]>; ++ ++def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], ++ [ImmArg>]>; ++def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty], ++ [ImmArg>]>; + } // TargetPrefix = "loongarch" +diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp +index 27219e89dc5f..435800d9e5f9 100644 +--- a/llvm/lib/IR/Function.cpp ++++ b/llvm/lib/IR/Function.cpp +@@ -37,6 +37,7 @@ + #include "llvm/IR/IntrinsicsBPF.h" + #include "llvm/IR/IntrinsicsDirectX.h" + #include "llvm/IR/IntrinsicsHexagon.h" 
++#include "llvm/IR/IntrinsicsLoongArch.h" + #include "llvm/IR/IntrinsicsMips.h" + #include "llvm/IR/IntrinsicsNVPTX.h" + #include "llvm/IR/IntrinsicsPowerPC.h" +-- +2.20.1 + + +From fca9d0a876fb72d3b483044a7616d27a47121512 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Sat, 19 Aug 2023 15:58:38 +0800 +Subject: [PATCH 02/35] [LoongArch] Add LSX intrinsic support + +For handling intrinsics, our approach is not simply to match them +one-to-one with instructions. Instead, we lower some intrinsics +to common nodes and then perform matching. The advantage of this +approach is that it allows us to fully utilize the passes available +at the common layer for optimizing purposes. + +We perform error checks on the immediate operand of all intrinsics, +rather than waiting until the end to throw exceptions. + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D155829 + +(cherry picked from commit 53141b2fcfa20616970833e6513537d211116c05) +--- + llvm/include/llvm/IR/IntrinsicsLoongArch.td | 524 ++++++++++ + .../LoongArch/LoongArchISelDAGToDAG.cpp | 100 +- + .../Target/LoongArch/LoongArchISelDAGToDAG.h | 8 + + .../LoongArch/LoongArchISelLowering.cpp | 902 +++++++++++++++++- + .../Target/LoongArch/LoongArchISelLowering.h | 14 + + .../Target/LoongArch/LoongArchInstrInfo.cpp | 12 + + .../Target/LoongArch/LoongArchInstrInfo.td | 6 +- + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 816 ++++++++++++++++ + 8 files changed, 2359 insertions(+), 23 deletions(-) + +diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +index 4219b2f55346..d39d8261ebe3 100644 +--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td ++++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +@@ -123,3 +123,527 @@ def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty], + [ImmArg>]>; + } // TargetPrefix = "loongarch" ++ ++/// Vector intrinsic ++ ++class 
VecInt ret_types, list param_types, ++ list intr_properties = []> ++ : Intrinsic, ++ ClangBuiltin; ++ ++//===----------------------------------------------------------------------===// ++// LSX ++ ++let TargetPrefix = "loongarch" in { ++ ++foreach inst = ["vadd_b", "vsub_b", ++ "vsadd_b", "vsadd_bu", "vssub_b", "vssub_bu", ++ "vavg_b", "vavg_bu", "vavgr_b", "vavgr_bu", ++ "vabsd_b", "vabsd_bu", "vadda_b", ++ "vmax_b", "vmax_bu", "vmin_b", "vmin_bu", ++ "vmul_b", "vmuh_b", "vmuh_bu", ++ "vdiv_b", "vdiv_bu", "vmod_b", "vmod_bu", "vsigncov_b", ++ "vand_v", "vor_v", "vxor_v", "vnor_v", "vandn_v", "vorn_v", ++ "vsll_b", "vsrl_b", "vsra_b", "vrotr_b", "vsrlr_b", "vsrar_b", ++ "vbitclr_b", "vbitset_b", "vbitrev_b", ++ "vseq_b", "vsle_b", "vsle_bu", "vslt_b", "vslt_bu", ++ "vpackev_b", "vpackod_b", "vpickev_b", "vpickod_b", ++ "vilvl_b", "vilvh_b"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v16i8_ty], ++ [llvm_v16i8_ty, llvm_v16i8_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vadd_h", "vsub_h", ++ "vsadd_h", "vsadd_hu", "vssub_h", "vssub_hu", ++ "vavg_h", "vavg_hu", "vavgr_h", "vavgr_hu", ++ "vabsd_h", "vabsd_hu", "vadda_h", ++ "vmax_h", "vmax_hu", "vmin_h", "vmin_hu", ++ "vmul_h", "vmuh_h", "vmuh_hu", ++ "vdiv_h", "vdiv_hu", "vmod_h", "vmod_hu", "vsigncov_h", ++ "vsll_h", "vsrl_h", "vsra_h", "vrotr_h", "vsrlr_h", "vsrar_h", ++ "vbitclr_h", "vbitset_h", "vbitrev_h", ++ "vseq_h", "vsle_h", "vsle_hu", "vslt_h", "vslt_hu", ++ "vpackev_h", "vpackod_h", "vpickev_h", "vpickod_h", ++ "vilvl_h", "vilvh_h"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], ++ [llvm_v8i16_ty, llvm_v8i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vadd_w", "vsub_w", ++ "vsadd_w", "vsadd_wu", "vssub_w", "vssub_wu", ++ "vavg_w", "vavg_wu", "vavgr_w", "vavgr_wu", ++ "vabsd_w", "vabsd_wu", "vadda_w", ++ "vmax_w", "vmax_wu", "vmin_w", "vmin_wu", ++ "vmul_w", "vmuh_w", "vmuh_wu", ++ "vdiv_w", "vdiv_wu", "vmod_w", "vmod_wu", "vsigncov_w", ++ "vsll_w", "vsrl_w", "vsra_w", "vrotr_w", "vsrlr_w", 
"vsrar_w", ++ "vbitclr_w", "vbitset_w", "vbitrev_w", ++ "vseq_w", "vsle_w", "vsle_wu", "vslt_w", "vslt_wu", ++ "vpackev_w", "vpackod_w", "vpickev_w", "vpickod_w", ++ "vilvl_w", "vilvh_w"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v4i32_ty, llvm_v4i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vadd_d", "vadd_q", "vsub_d", "vsub_q", ++ "vsadd_d", "vsadd_du", "vssub_d", "vssub_du", ++ "vhaddw_q_d", "vhaddw_qu_du", "vhsubw_q_d", "vhsubw_qu_du", ++ "vaddwev_q_d", "vaddwod_q_d", "vsubwev_q_d", "vsubwod_q_d", ++ "vaddwev_q_du", "vaddwod_q_du", "vsubwev_q_du", "vsubwod_q_du", ++ "vaddwev_q_du_d", "vaddwod_q_du_d", ++ "vavg_d", "vavg_du", "vavgr_d", "vavgr_du", ++ "vabsd_d", "vabsd_du", "vadda_d", ++ "vmax_d", "vmax_du", "vmin_d", "vmin_du", ++ "vmul_d", "vmuh_d", "vmuh_du", ++ "vmulwev_q_d", "vmulwod_q_d", "vmulwev_q_du", "vmulwod_q_du", ++ "vmulwev_q_du_d", "vmulwod_q_du_d", ++ "vdiv_d", "vdiv_du", "vmod_d", "vmod_du", "vsigncov_d", ++ "vsll_d", "vsrl_d", "vsra_d", "vrotr_d", "vsrlr_d", "vsrar_d", ++ "vbitclr_d", "vbitset_d", "vbitrev_d", ++ "vseq_d", "vsle_d", "vsle_du", "vslt_d", "vslt_du", ++ "vpackev_d", "vpackod_d", "vpickev_d", "vpickod_d", ++ "vilvl_d", "vilvh_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vaddi_bu", "vsubi_bu", ++ "vmaxi_b", "vmaxi_bu", "vmini_b", "vmini_bu", ++ "vsat_b", "vsat_bu", ++ "vandi_b", "vori_b", "vxori_b", "vnori_b", ++ "vslli_b", "vsrli_b", "vsrai_b", "vrotri_b", ++ "vsrlri_b", "vsrari_b", ++ "vbitclri_b", "vbitseti_b", "vbitrevi_b", ++ "vseqi_b", "vslei_b", "vslei_bu", "vslti_b", "vslti_bu", ++ "vreplvei_b", "vbsll_v", "vbsrl_v", "vshuf4i_b"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v16i8_ty], ++ [llvm_v16i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vaddi_hu", "vsubi_hu", ++ "vmaxi_h", "vmaxi_hu", "vmini_h", "vmini_hu", ++ "vsat_h", "vsat_hu", ++ "vslli_h", "vsrli_h", "vsrai_h", 
"vrotri_h", ++ "vsrlri_h", "vsrari_h", ++ "vbitclri_h", "vbitseti_h", "vbitrevi_h", ++ "vseqi_h", "vslei_h", "vslei_hu", "vslti_h", "vslti_hu", ++ "vreplvei_h", "vshuf4i_h"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], ++ [llvm_v8i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vaddi_wu", "vsubi_wu", ++ "vmaxi_w", "vmaxi_wu", "vmini_w", "vmini_wu", ++ "vsat_w", "vsat_wu", ++ "vslli_w", "vsrli_w", "vsrai_w", "vrotri_w", ++ "vsrlri_w", "vsrari_w", ++ "vbitclri_w", "vbitseti_w", "vbitrevi_w", ++ "vseqi_w", "vslei_w", "vslei_wu", "vslti_w", "vslti_wu", ++ "vreplvei_w", "vshuf4i_w"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v4i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vaddi_du", "vsubi_du", ++ "vmaxi_d", "vmaxi_du", "vmini_d", "vmini_du", ++ "vsat_d", "vsat_du", ++ "vslli_d", "vsrli_d", "vsrai_d", "vrotri_d", ++ "vsrlri_d", "vsrari_d", ++ "vbitclri_d", "vbitseti_d", "vbitrevi_d", ++ "vseqi_d", "vslei_d", "vslei_du", "vslti_d", "vslti_du", ++ "vreplvei_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++foreach inst = ["vhaddw_h_b", "vhaddw_hu_bu", "vhsubw_h_b", "vhsubw_hu_bu", ++ "vaddwev_h_b", "vaddwod_h_b", "vsubwev_h_b", "vsubwod_h_b", ++ "vaddwev_h_bu", "vaddwod_h_bu", "vsubwev_h_bu", "vsubwod_h_bu", ++ "vaddwev_h_bu_b", "vaddwod_h_bu_b", ++ "vmulwev_h_b", "vmulwod_h_b", "vmulwev_h_bu", "vmulwod_h_bu", ++ "vmulwev_h_bu_b", "vmulwod_h_bu_b"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], ++ [llvm_v16i8_ty, llvm_v16i8_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vhaddw_w_h", "vhaddw_wu_hu", "vhsubw_w_h", "vhsubw_wu_hu", ++ "vaddwev_w_h", "vaddwod_w_h", "vsubwev_w_h", "vsubwod_w_h", ++ "vaddwev_w_hu", "vaddwod_w_hu", "vsubwev_w_hu", "vsubwod_w_hu", ++ "vaddwev_w_hu_h", "vaddwod_w_hu_h", ++ "vmulwev_w_h", "vmulwod_w_h", "vmulwev_w_hu", "vmulwod_w_hu", ++ "vmulwev_w_hu_h", "vmulwod_w_hu_h"] in ++ def 
int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v8i16_ty, llvm_v8i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vhaddw_d_w", "vhaddw_du_wu", "vhsubw_d_w", "vhsubw_du_wu", ++ "vaddwev_d_w", "vaddwod_d_w", "vsubwev_d_w", "vsubwod_d_w", ++ "vaddwev_d_wu", "vaddwod_d_wu", "vsubwev_d_wu", "vsubwod_d_wu", ++ "vaddwev_d_wu_w", "vaddwod_d_wu_w", ++ "vmulwev_d_w", "vmulwod_d_w", "vmulwev_d_wu", "vmulwod_d_wu", ++ "vmulwev_d_wu_w", "vmulwod_d_wu_w"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], ++ [llvm_v4i32_ty, llvm_v4i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vsrln_b_h", "vsran_b_h", "vsrlrn_b_h", "vsrarn_b_h", ++ "vssrln_b_h", "vssran_b_h", "vssrln_bu_h", "vssran_bu_h", ++ "vssrlrn_b_h", "vssrarn_b_h", "vssrlrn_bu_h", "vssrarn_bu_h"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v16i8_ty], ++ [llvm_v8i16_ty, llvm_v8i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vsrln_h_w", "vsran_h_w", "vsrlrn_h_w", "vsrarn_h_w", ++ "vssrln_h_w", "vssran_h_w", "vssrln_hu_w", "vssran_hu_w", ++ "vssrlrn_h_w", "vssrarn_h_w", "vssrlrn_hu_w", "vssrarn_hu_w"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], ++ [llvm_v4i32_ty, llvm_v4i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vsrln_w_d", "vsran_w_d", "vsrlrn_w_d", "vsrarn_w_d", ++ "vssrln_w_d", "vssran_w_d", "vssrln_wu_d", "vssran_wu_d", ++ "vssrlrn_w_d", "vssrarn_w_d", "vssrlrn_wu_d", "vssrarn_wu_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vmadd_b", "vmsub_b", "vfrstp_b", "vbitsel_v", "vshuf_b"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v16i8_ty], ++ [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["vmadd_h", "vmsub_h", "vfrstp_h", "vshuf_h"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v8i16_ty], ++ [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["vmadd_w", "vmsub_w", "vshuf_w"] in ++ def int_loongarch_lsx_#inst ++ : 
VecInt<[llvm_v4i32_ty], ++ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vmadd_d", "vmsub_d", "vshuf_d"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vsrlni_b_h", "vsrani_b_h", "vsrlrni_b_h", "vsrarni_b_h", ++ "vssrlni_b_h", "vssrani_b_h", "vssrlni_bu_h", "vssrani_bu_h", ++ "vssrlrni_b_h", "vssrarni_b_h", "vssrlrni_bu_h", "vssrarni_bu_h", ++ "vfrstpi_b", "vbitseli_b", "vextrins_b"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v16i8_ty], ++ [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vsrlni_h_w", "vsrani_h_w", "vsrlrni_h_w", "vsrarni_h_w", ++ "vssrlni_h_w", "vssrani_h_w", "vssrlni_hu_w", "vssrani_hu_w", ++ "vssrlrni_h_w", "vssrarni_h_w", "vssrlrni_hu_w", "vssrarni_hu_w", ++ "vfrstpi_h", "vextrins_h"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v8i16_ty], ++ [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vsrlni_w_d", "vsrani_w_d", "vsrlrni_w_d", "vsrarni_w_d", ++ "vssrlni_w_d", "vssrani_w_d", "vssrlni_wu_d", "vssrani_wu_d", ++ "vssrlrni_w_d", "vssrarni_w_d", "vssrlrni_wu_d", "vssrarni_wu_d", ++ "vpermi_w", "vextrins_w"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v4i32_ty], ++ [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vsrlni_d_q", "vsrani_d_q", "vsrlrni_d_q", "vsrarni_d_q", ++ "vssrlni_d_q", "vssrani_d_q", "vssrlni_du_q", "vssrani_du_q", ++ "vssrlrni_d_q", "vssrarni_d_q", "vssrlrni_du_q", "vssrarni_du_q", ++ "vshuf4i_d", "vextrins_d"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++foreach inst = ["vmaddwev_h_b", "vmaddwod_h_b", "vmaddwev_h_bu", ++ "vmaddwod_h_bu", "vmaddwev_h_bu_b", "vmaddwod_h_bu_b"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v8i16_ty], ++ 
[llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["vmaddwev_w_h", "vmaddwod_w_h", "vmaddwev_w_hu", ++ "vmaddwod_w_hu", "vmaddwev_w_hu_h", "vmaddwod_w_hu_h"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v4i32_ty], ++ [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["vmaddwev_d_w", "vmaddwod_d_w", "vmaddwev_d_wu", ++ "vmaddwod_d_wu", "vmaddwev_d_wu_w", "vmaddwod_d_wu_w"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vmaddwev_q_d", "vmaddwod_q_d", "vmaddwev_q_du", ++ "vmaddwod_q_du", "vmaddwev_q_du_d", "vmaddwod_q_du_d"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v2i64_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vsllwil_h_b", "vsllwil_hu_bu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], ++ [llvm_v16i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vsllwil_w_h", "vsllwil_wu_hu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v8i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vsllwil_d_w", "vsllwil_du_wu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], ++ [llvm_v4i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++foreach inst = ["vneg_b", "vmskltz_b", "vmskgez_b", "vmsknz_b", ++ "vclo_b", "vclz_b", "vpcnt_b"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v16i8_ty], [llvm_v16i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["vneg_h", "vmskltz_h", "vclo_h", "vclz_h", "vpcnt_h"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], [llvm_v8i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["vneg_w", "vmskltz_w", "vclo_w", "vclz_w", "vpcnt_w"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], [llvm_v4i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vneg_d", "vexth_q_d", "vexth_qu_du", "vmskltz_d", ++ "vextl_q_d", "vextl_qu_du", "vclo_d", "vclz_d", "vpcnt_d"] 
in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], [llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vexth_h_b", "vexth_hu_bu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], [llvm_v16i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["vexth_w_h", "vexth_wu_hu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], [llvm_v8i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["vexth_d_w", "vexth_du_wu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], [llvm_v4i32_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lsx_vldi : VecInt<[llvm_v2i64_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vrepli_b : VecInt<[llvm_v16i8_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vrepli_h : VecInt<[llvm_v8i16_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vrepli_w : VecInt<[llvm_v4i32_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vrepli_d : VecInt<[llvm_v2i64_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++def int_loongarch_lsx_vreplgr2vr_b : VecInt<[llvm_v16i8_ty], [llvm_i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_vreplgr2vr_h : VecInt<[llvm_v8i16_ty], [llvm_i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_vreplgr2vr_w : VecInt<[llvm_v4i32_ty], [llvm_i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_vreplgr2vr_d : VecInt<[llvm_v2i64_ty], [llvm_i64_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lsx_vinsgr2vr_b ++ : VecInt<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vinsgr2vr_h ++ : VecInt<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vinsgr2vr_w ++ : VecInt<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lsx_vinsgr2vr_d ++ : VecInt<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++def int_loongarch_lsx_vreplve_b ++ : 
VecInt<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; ++def int_loongarch_lsx_vreplve_h ++ : VecInt<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>; ++def int_loongarch_lsx_vreplve_w ++ : VecInt<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>; ++def int_loongarch_lsx_vreplve_d ++ : VecInt<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; ++ ++foreach inst = ["vpickve2gr_b", "vpickve2gr_bu" ] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_i32_ty], ++ [llvm_v16i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vpickve2gr_h", "vpickve2gr_hu" ] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_i32_ty], ++ [llvm_v8i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vpickve2gr_w", "vpickve2gr_wu" ] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_i32_ty], ++ [llvm_v4i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["vpickve2gr_d", "vpickve2gr_du" ] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_i64_ty], ++ [llvm_v2i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++def int_loongarch_lsx_bz_b : VecInt<[llvm_i32_ty], [llvm_v16i8_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bz_h : VecInt<[llvm_i32_ty], [llvm_v8i16_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bz_w : VecInt<[llvm_i32_ty], [llvm_v4i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bz_d : VecInt<[llvm_i32_ty], [llvm_v2i64_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bz_v : VecInt<[llvm_i32_ty], [llvm_v16i8_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lsx_bnz_v : VecInt<[llvm_i32_ty], [llvm_v16i8_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bnz_b : VecInt<[llvm_i32_ty], [llvm_v16i8_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bnz_h : VecInt<[llvm_i32_ty], [llvm_v8i16_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bnz_w : VecInt<[llvm_i32_ty], [llvm_v4i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lsx_bnz_d : VecInt<[llvm_i32_ty], [llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++// LSX Float ++ ++foreach inst = ["vfadd_s", 
"vfsub_s", "vfmul_s", "vfdiv_s", ++ "vfmax_s", "vfmin_s", "vfmaxa_s", "vfmina_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], ++ [llvm_v4f32_ty, llvm_v4f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vfadd_d", "vfsub_d", "vfmul_d", "vfdiv_d", ++ "vfmax_d", "vfmin_d", "vfmaxa_d", "vfmina_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], ++ [llvm_v2f64_ty, llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vfmadd_s", "vfmsub_s", "vfnmadd_s", "vfnmsub_s"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v4f32_ty], ++ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vfmadd_d", "vfmsub_d", "vfnmadd_d", "vfnmsub_d"] in ++ def int_loongarch_lsx_#inst ++ : VecInt<[llvm_v2f64_ty], ++ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vflogb_s", "vfsqrt_s", "vfrecip_s", "vfrsqrt_s", "vfrint_s", ++ "vfrintrne_s", "vfrintrz_s", "vfrintrp_s", "vfrintrm_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vflogb_d", "vfsqrt_d", "vfrecip_d", "vfrsqrt_d", "vfrint_d", ++ "vfrintrne_d", "vfrintrz_d", "vfrintrp_d", "vfrintrm_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vfcvtl_s_h", "vfcvth_s_h"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v8i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["vfcvtl_d_s", "vfcvth_d_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v4f32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vftintrne_w_s", "vftintrz_w_s", "vftintrp_w_s", "vftintrm_w_s", ++ "vftint_w_s", "vftintrz_wu_s", "vftint_wu_s", "vfclass_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], [llvm_v4f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vftintrne_l_d", "vftintrz_l_d", "vftintrp_l_d", "vftintrm_l_d", ++ "vftint_l_d", "vftintrz_lu_d", "vftint_lu_d", "vfclass_d"] in ++ def int_loongarch_lsx_#inst : 
VecInt<[llvm_v2i64_ty], [llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vftintrnel_l_s", "vftintrneh_l_s", "vftintrzl_l_s", ++ "vftintrzh_l_s", "vftintrpl_l_s", "vftintrph_l_s", ++ "vftintrml_l_s", "vftintrmh_l_s", "vftintl_l_s", ++ "vftinth_l_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], [llvm_v4f32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vffint_s_w", "vffint_s_wu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vffint_d_l", "vffint_d_lu"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vffintl_d_w", "vffinth_d_w"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v4i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vffint_s_l"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], ++ [llvm_v2i64_ty, llvm_v2i64_ty], ++ [IntrNoMem]>; ++foreach inst = ["vftintrne_w_d", "vftintrz_w_d", "vftintrp_w_d", "vftintrm_w_d", ++ "vftint_w_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v2f64_ty, llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vfcvt_h_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v8i16_ty], ++ [llvm_v4f32_ty, llvm_v4f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vfcvt_s_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], ++ [llvm_v2f64_ty, llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vfcmp_caf_s", "vfcmp_cun_s", "vfcmp_ceq_s", "vfcmp_cueq_s", ++ "vfcmp_clt_s", "vfcmp_cult_s", "vfcmp_cle_s", "vfcmp_cule_s", ++ "vfcmp_cne_s", "vfcmp_cor_s", "vfcmp_cune_s", ++ "vfcmp_saf_s", "vfcmp_sun_s", "vfcmp_seq_s", "vfcmp_sueq_s", ++ "vfcmp_slt_s", "vfcmp_sult_s", "vfcmp_sle_s", "vfcmp_sule_s", ++ "vfcmp_sne_s", "vfcmp_sor_s", "vfcmp_sune_s"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v4i32_ty], ++ [llvm_v4f32_ty, llvm_v4f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["vfcmp_caf_d", "vfcmp_cun_d", "vfcmp_ceq_d", "vfcmp_cueq_d", ++ 
"vfcmp_clt_d", "vfcmp_cult_d", "vfcmp_cle_d", "vfcmp_cule_d", ++ "vfcmp_cne_d", "vfcmp_cor_d", "vfcmp_cune_d", ++ "vfcmp_saf_d", "vfcmp_sun_d", "vfcmp_seq_d", "vfcmp_sueq_d", ++ "vfcmp_slt_d", "vfcmp_sult_d", "vfcmp_sle_d", "vfcmp_sule_d", ++ "vfcmp_sne_d", "vfcmp_sor_d", "vfcmp_sune_d"] in ++ def int_loongarch_lsx_#inst : VecInt<[llvm_v2i64_ty], ++ [llvm_v2f64_ty, llvm_v2f64_ty], ++ [IntrNoMem]>; ++ ++// LSX load/store ++def int_loongarch_lsx_vld ++ : VecInt<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lsx_vldx ++ : VecInt<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i64_ty], ++ [IntrReadMem, IntrArgMemOnly]>; ++def int_loongarch_lsx_vldrepl_b ++ : VecInt<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lsx_vldrepl_h ++ : VecInt<[llvm_v8i16_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lsx_vldrepl_w ++ : VecInt<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lsx_vldrepl_d ++ : VecInt<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++ ++def int_loongarch_lsx_vst ++ : VecInt<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lsx_vstx ++ : VecInt<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i64_ty], ++ [IntrWriteMem, IntrArgMemOnly]>; ++def int_loongarch_lsx_vstelm_b ++ : VecInt<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++def int_loongarch_lsx_vstelm_h ++ : VecInt<[], [llvm_v8i16_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++def int_loongarch_lsx_vstelm_w ++ : VecInt<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++def int_loongarch_lsx_vstelm_d ++ : VecInt<[], 
[llvm_v2i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++ ++} // TargetPrefix = "loongarch" +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +index ae7167cb5ce7..f55184019988 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +@@ -15,6 +15,7 @@ + #include "MCTargetDesc/LoongArchMCTargetDesc.h" + #include "MCTargetDesc/LoongArchMatInt.h" + #include "llvm/Support/KnownBits.h" ++#include "llvm/Support/raw_ostream.h" + + using namespace llvm; + +@@ -75,7 +76,14 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { + ReplaceNode(Node, CurDAG->getMachineNode(ADDIOp, DL, VT, TFI, Imm)); + return; + } +- // TODO: Add selection nodes needed later. ++ case ISD::BITCAST: { ++ if (VT.is128BitVector() || VT.is512BitVector()) { ++ ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); ++ CurDAG->RemoveDeadNode(Node); ++ return; ++ } ++ break; ++ } + } + + // Select the default instruction. 
+@@ -262,6 +270,96 @@ bool LoongArchDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) { + return false; + } + ++bool LoongArchDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm, ++ unsigned MinSizeInBits) const { ++ if (!Subtarget->hasExtLSX()) ++ return false; ++ ++ BuildVectorSDNode *Node = dyn_cast(N); ++ ++ if (!Node) ++ return false; ++ ++ APInt SplatValue, SplatUndef; ++ unsigned SplatBitSize; ++ bool HasAnyUndefs; ++ ++ if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ++ MinSizeInBits, /*IsBigEndian=*/false)) ++ return false; ++ ++ Imm = SplatValue; ++ ++ return true; ++} ++ ++template ++bool LoongArchDAGToDAGISel::selectVSplatImm(SDValue N, SDValue &SplatVal) { ++ APInt ImmValue; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0); ++ ++ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ++ ImmValue.getBitWidth() == EltTy.getSizeInBits()) { ++ if (IsSigned && ImmValue.isSignedIntN(ImmBitSize)) { ++ SplatVal = CurDAG->getTargetConstant(ImmValue.getSExtValue(), SDLoc(N), ++ Subtarget->getGRLenVT()); ++ return true; ++ } ++ if (!IsSigned && ImmValue.isIntN(ImmBitSize)) { ++ SplatVal = CurDAG->getTargetConstant(ImmValue.getZExtValue(), SDLoc(N), ++ Subtarget->getGRLenVT()); ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++bool LoongArchDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, ++ SDValue &SplatImm) const { ++ APInt ImmValue; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0); ++ ++ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ++ ImmValue.getBitWidth() == EltTy.getSizeInBits()) { ++ int32_t Log2 = (~ImmValue).exactLogBase2(); ++ ++ if (Log2 != -1) { ++ SplatImm = CurDAG->getTargetConstant(Log2, SDLoc(N), EltTy); ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++bool LoongArchDAGToDAGISel::selectVSplatUimmPow2(SDValue N, ++ SDValue 
&SplatImm) const { ++ APInt ImmValue; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0); ++ ++ if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ++ ImmValue.getBitWidth() == EltTy.getSizeInBits()) { ++ int32_t Log2 = ImmValue.exactLogBase2(); ++ ++ if (Log2 != -1) { ++ SplatImm = CurDAG->getTargetConstant(Log2, SDLoc(N), EltTy); ++ return true; ++ } ++ } ++ ++ return false; ++} ++ + // This pass converts a legalized DAG into a LoongArch-specific DAG, ready + // for instruction scheduling. + FunctionPass *llvm::createLoongArchISelDag(LoongArchTargetMachine &TM) { +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +index 3099407aea3e..5e3d6ccc3755 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h +@@ -56,6 +56,14 @@ public: + bool selectSExti32(SDValue N, SDValue &Val); + bool selectZExti32(SDValue N, SDValue &Val); + ++ bool selectVSplat(SDNode *N, APInt &Imm, unsigned MinSizeInBits) const; ++ ++ template ++ bool selectVSplatImm(SDValue N, SDValue &SplatVal); ++ ++ bool selectVSplatUimmInvPow2(SDValue N, SDValue &SplatImm) const; ++ bool selectVSplatUimmPow2(SDValue N, SDValue &SplatImm) const; ++ + // Include the pieces autogenerated from the target description. 
+ #include "LoongArchGenDAGISel.inc" + }; +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index db5961fc501a..c05133647929 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -62,6 +62,13 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + MVT::v4i64}) + addRegisterClass(VT, &LoongArch::LASX256RegClass); + ++ static const MVT::SimpleValueType LSXVTs[] = { ++ MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64}; ++ ++ if (Subtarget.hasExtLSX()) ++ for (MVT VT : LSXVTs) ++ addRegisterClass(VT, &LoongArch::LSX128RegClass); ++ + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, GRLenVT, + MVT::i1, Promote); + +@@ -109,6 +116,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::READ_REGISTER, MVT::i32, Custom); + setOperationAction(ISD::WRITE_REGISTER, MVT::i32, Custom); ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i32, Custom); + if (Subtarget.hasBasicF() && !Subtarget.hasBasicD()) + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + if (Subtarget.hasBasicF()) +@@ -138,6 +146,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i64, Custom); ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } + + static const ISD::CondCode FPCCToExpand[] = { +@@ -194,6 +203,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::UINT_TO_FP, GRLenVT, Custom); + } + ++ if (Subtarget.hasExtLSX()) ++ setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, ++ {MVT::v2i64, MVT::v4i32, 
MVT::v8i16, MVT::v16i8}, Legal); ++ + // Compute derived properties from the register classes. + computeRegisterProperties(Subtarget.getRegisterInfo()); + +@@ -215,6 +228,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::SRL); ++ if (Subtarget.hasExtLSX()) ++ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + } + + bool LoongArchTargetLowering::isOffsetFoldingLegal( +@@ -652,9 +667,24 @@ LoongArchTargetLowering::lowerGlobalTLSAddress(SDValue Op, + return Addr; + } + ++template ++static SDValue checkIntrinsicImmArg(SDValue Op, unsigned ImmOp, ++ SelectionDAG &DAG, bool IsSigned = false) { ++ auto *CImm = cast(Op->getOperand(ImmOp)); ++ // Check the ImmArg. ++ if ((IsSigned && !isInt(CImm->getSExtValue())) || ++ (!IsSigned && !isUInt(CImm->getZExtValue()))) { ++ DAG.getContext()->emitError(Op->getOperationName(0) + ++ ": argument out of range."); ++ return DAG.getNode(ISD::UNDEF, SDLoc(Op), Op.getValueType()); ++ } ++ return SDValue(); ++} ++ + SDValue + LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { ++ SDLoc DL(Op); + switch (Op.getConstantOperandVal(0)) { + default: + return SDValue(); // Don't custom lower most intrinsics. 
+@@ -662,6 +692,141 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getRegister(LoongArch::R2, PtrVT); + } ++ case Intrinsic::loongarch_lsx_vpickve2gr_d: ++ case Intrinsic::loongarch_lsx_vpickve2gr_du: ++ case Intrinsic::loongarch_lsx_vreplvei_d: ++ return checkIntrinsicImmArg<1>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vreplvei_w: ++ return checkIntrinsicImmArg<2>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vsat_b: ++ case Intrinsic::loongarch_lsx_vsat_bu: ++ case Intrinsic::loongarch_lsx_vrotri_b: ++ case Intrinsic::loongarch_lsx_vsllwil_h_b: ++ case Intrinsic::loongarch_lsx_vsllwil_hu_bu: ++ case Intrinsic::loongarch_lsx_vsrlri_b: ++ case Intrinsic::loongarch_lsx_vsrari_b: ++ case Intrinsic::loongarch_lsx_vreplvei_h: ++ return checkIntrinsicImmArg<3>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vsat_h: ++ case Intrinsic::loongarch_lsx_vsat_hu: ++ case Intrinsic::loongarch_lsx_vrotri_h: ++ case Intrinsic::loongarch_lsx_vsllwil_w_h: ++ case Intrinsic::loongarch_lsx_vsllwil_wu_hu: ++ case Intrinsic::loongarch_lsx_vsrlri_h: ++ case Intrinsic::loongarch_lsx_vsrari_h: ++ case Intrinsic::loongarch_lsx_vreplvei_b: ++ return checkIntrinsicImmArg<4>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vsrlni_b_h: ++ case Intrinsic::loongarch_lsx_vsrani_b_h: ++ case Intrinsic::loongarch_lsx_vsrlrni_b_h: ++ case Intrinsic::loongarch_lsx_vsrarni_b_h: ++ case Intrinsic::loongarch_lsx_vssrlni_b_h: ++ case Intrinsic::loongarch_lsx_vssrani_b_h: ++ case Intrinsic::loongarch_lsx_vssrlni_bu_h: ++ case Intrinsic::loongarch_lsx_vssrani_bu_h: ++ case Intrinsic::loongarch_lsx_vssrlrni_b_h: ++ case Intrinsic::loongarch_lsx_vssrarni_b_h: ++ case Intrinsic::loongarch_lsx_vssrlrni_bu_h: ++ case Intrinsic::loongarch_lsx_vssrarni_bu_h: ++ return checkIntrinsicImmArg<4>(Op, 3, DAG); ++ case Intrinsic::loongarch_lsx_vsat_w: ++ case Intrinsic::loongarch_lsx_vsat_wu: ++ case Intrinsic::loongarch_lsx_vrotri_w: ++ case 
Intrinsic::loongarch_lsx_vsllwil_d_w: ++ case Intrinsic::loongarch_lsx_vsllwil_du_wu: ++ case Intrinsic::loongarch_lsx_vsrlri_w: ++ case Intrinsic::loongarch_lsx_vsrari_w: ++ case Intrinsic::loongarch_lsx_vslei_bu: ++ case Intrinsic::loongarch_lsx_vslei_hu: ++ case Intrinsic::loongarch_lsx_vslei_wu: ++ case Intrinsic::loongarch_lsx_vslei_du: ++ case Intrinsic::loongarch_lsx_vslti_bu: ++ case Intrinsic::loongarch_lsx_vslti_hu: ++ case Intrinsic::loongarch_lsx_vslti_wu: ++ case Intrinsic::loongarch_lsx_vslti_du: ++ case Intrinsic::loongarch_lsx_vbsll_v: ++ case Intrinsic::loongarch_lsx_vbsrl_v: ++ return checkIntrinsicImmArg<5>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vseqi_b: ++ case Intrinsic::loongarch_lsx_vseqi_h: ++ case Intrinsic::loongarch_lsx_vseqi_w: ++ case Intrinsic::loongarch_lsx_vseqi_d: ++ case Intrinsic::loongarch_lsx_vslei_b: ++ case Intrinsic::loongarch_lsx_vslei_h: ++ case Intrinsic::loongarch_lsx_vslei_w: ++ case Intrinsic::loongarch_lsx_vslei_d: ++ case Intrinsic::loongarch_lsx_vslti_b: ++ case Intrinsic::loongarch_lsx_vslti_h: ++ case Intrinsic::loongarch_lsx_vslti_w: ++ case Intrinsic::loongarch_lsx_vslti_d: ++ return checkIntrinsicImmArg<5>(Op, 2, DAG, /*IsSigned=*/true); ++ case Intrinsic::loongarch_lsx_vsrlni_h_w: ++ case Intrinsic::loongarch_lsx_vsrani_h_w: ++ case Intrinsic::loongarch_lsx_vsrlrni_h_w: ++ case Intrinsic::loongarch_lsx_vsrarni_h_w: ++ case Intrinsic::loongarch_lsx_vssrlni_h_w: ++ case Intrinsic::loongarch_lsx_vssrani_h_w: ++ case Intrinsic::loongarch_lsx_vssrlni_hu_w: ++ case Intrinsic::loongarch_lsx_vssrani_hu_w: ++ case Intrinsic::loongarch_lsx_vssrlrni_h_w: ++ case Intrinsic::loongarch_lsx_vssrarni_h_w: ++ case Intrinsic::loongarch_lsx_vssrlrni_hu_w: ++ case Intrinsic::loongarch_lsx_vssrarni_hu_w: ++ case Intrinsic::loongarch_lsx_vfrstpi_b: ++ case Intrinsic::loongarch_lsx_vfrstpi_h: ++ return checkIntrinsicImmArg<5>(Op, 3, DAG); ++ case Intrinsic::loongarch_lsx_vsat_d: ++ case Intrinsic::loongarch_lsx_vsat_du: ++ 
case Intrinsic::loongarch_lsx_vrotri_d: ++ case Intrinsic::loongarch_lsx_vsrlri_d: ++ case Intrinsic::loongarch_lsx_vsrari_d: ++ return checkIntrinsicImmArg<6>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vsrlni_w_d: ++ case Intrinsic::loongarch_lsx_vsrani_w_d: ++ case Intrinsic::loongarch_lsx_vsrlrni_w_d: ++ case Intrinsic::loongarch_lsx_vsrarni_w_d: ++ case Intrinsic::loongarch_lsx_vssrlni_w_d: ++ case Intrinsic::loongarch_lsx_vssrani_w_d: ++ case Intrinsic::loongarch_lsx_vssrlni_wu_d: ++ case Intrinsic::loongarch_lsx_vssrani_wu_d: ++ case Intrinsic::loongarch_lsx_vssrlrni_w_d: ++ case Intrinsic::loongarch_lsx_vssrarni_w_d: ++ case Intrinsic::loongarch_lsx_vssrlrni_wu_d: ++ case Intrinsic::loongarch_lsx_vssrarni_wu_d: ++ return checkIntrinsicImmArg<6>(Op, 3, DAG); ++ case Intrinsic::loongarch_lsx_vsrlni_d_q: ++ case Intrinsic::loongarch_lsx_vsrani_d_q: ++ case Intrinsic::loongarch_lsx_vsrlrni_d_q: ++ case Intrinsic::loongarch_lsx_vsrarni_d_q: ++ case Intrinsic::loongarch_lsx_vssrlni_d_q: ++ case Intrinsic::loongarch_lsx_vssrani_d_q: ++ case Intrinsic::loongarch_lsx_vssrlni_du_q: ++ case Intrinsic::loongarch_lsx_vssrani_du_q: ++ case Intrinsic::loongarch_lsx_vssrlrni_d_q: ++ case Intrinsic::loongarch_lsx_vssrarni_d_q: ++ case Intrinsic::loongarch_lsx_vssrlrni_du_q: ++ case Intrinsic::loongarch_lsx_vssrarni_du_q: ++ return checkIntrinsicImmArg<7>(Op, 3, DAG); ++ case Intrinsic::loongarch_lsx_vnori_b: ++ case Intrinsic::loongarch_lsx_vshuf4i_b: ++ case Intrinsic::loongarch_lsx_vshuf4i_h: ++ case Intrinsic::loongarch_lsx_vshuf4i_w: ++ return checkIntrinsicImmArg<8>(Op, 2, DAG); ++ case Intrinsic::loongarch_lsx_vshuf4i_d: ++ case Intrinsic::loongarch_lsx_vpermi_w: ++ case Intrinsic::loongarch_lsx_vbitseli_b: ++ case Intrinsic::loongarch_lsx_vextrins_b: ++ case Intrinsic::loongarch_lsx_vextrins_h: ++ case Intrinsic::loongarch_lsx_vextrins_w: ++ case Intrinsic::loongarch_lsx_vextrins_d: ++ return checkIntrinsicImmArg<8>(Op, 3, DAG); ++ case 
Intrinsic::loongarch_lsx_vrepli_b: ++ case Intrinsic::loongarch_lsx_vrepli_h: ++ case Intrinsic::loongarch_lsx_vrepli_w: ++ case Intrinsic::loongarch_lsx_vrepli_d: ++ return checkIntrinsicImmArg<10>(Op, 1, DAG, /*IsSigned=*/true); ++ case Intrinsic::loongarch_lsx_vldi: ++ return checkIntrinsicImmArg<13>(Op, 1, DAG, /*IsSigned=*/true); + } + } + +@@ -757,6 +922,29 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, + : DAG.getNode(LoongArchISD::MOVFCSR2GR, DL, {VT, MVT::Other}, + {Chain, DAG.getConstant(Imm, DL, GRLenVT)}); + } ++ case Intrinsic::loongarch_lsx_vld: ++ case Intrinsic::loongarch_lsx_vldrepl_b: ++ return !isInt<12>(cast(Op.getOperand(3))->getSExtValue()) ++ ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vldrepl_h: ++ return !isShiftedInt<11, 1>( ++ cast(Op.getOperand(3))->getSExtValue()) ++ ? emitIntrinsicWithChainErrorMessage( ++ Op, "argument out of range or not a multiple of 2", DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vldrepl_w: ++ return !isShiftedInt<10, 2>( ++ cast(Op.getOperand(3))->getSExtValue()) ++ ? emitIntrinsicWithChainErrorMessage( ++ Op, "argument out of range or not a multiple of 4", DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vldrepl_d: ++ return !isShiftedInt<9, 3>( ++ cast(Op.getOperand(3))->getSExtValue()) ++ ? emitIntrinsicWithChainErrorMessage( ++ Op, "argument out of range or not a multiple of 8", DAG) ++ : SDValue(); + } + } + +@@ -875,6 +1063,36 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, + : !isUInt<8>(Imm) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) + : Op; + } ++ case Intrinsic::loongarch_lsx_vst: ++ return !isInt<12>(cast(Op.getOperand(4))->getSExtValue()) ++ ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vstelm_b: ++ return (!isInt<8>(cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<4>(cast(Op.getOperand(5))->getZExtValue())) ++ ? 
emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vstelm_h: ++ return (!isShiftedInt<8, 1>( ++ cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<3>(cast(Op.getOperand(5))->getZExtValue())) ++ ? emitIntrinsicErrorMessage( ++ Op, "argument out of range or not a multiple of 2", DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vstelm_w: ++ return (!isShiftedInt<8, 2>( ++ cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<2>(cast(Op.getOperand(5))->getZExtValue())) ++ ? emitIntrinsicErrorMessage( ++ Op, "argument out of range or not a multiple of 4", DAG) ++ : SDValue(); ++ case Intrinsic::loongarch_lsx_vstelm_d: ++ return (!isShiftedInt<8, 3>( ++ cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<1>(cast(Op.getOperand(5))->getZExtValue())) ++ ? emitIntrinsicErrorMessage( ++ Op, "argument out of range or not a multiple of 8", DAG) ++ : SDValue(); + } + } + +@@ -1026,16 +1244,110 @@ static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG, int NumOp, + return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewRes); + } + +-// Helper function that emits error message for intrinsics with chain and return +-// a UNDEF and the chain as the results. +-static void emitErrorAndReplaceIntrinsicWithChainResults( ++// Helper function that emits error message for intrinsics with/without chain ++// and return a UNDEF or and the chain as the results. 
++static void emitErrorAndReplaceIntrinsicResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, +- StringRef ErrorMsg) { ++ StringRef ErrorMsg, bool WithChain = true) { + DAG.getContext()->emitError(N->getOperationName(0) + ": " + ErrorMsg + "."); + Results.push_back(DAG.getUNDEF(N->getValueType(0))); ++ if (!WithChain) ++ return; + Results.push_back(N->getOperand(0)); + } + ++template ++static void ++replaceVPICKVE2GRResults(SDNode *Node, SmallVectorImpl &Results, ++ SelectionDAG &DAG, const LoongArchSubtarget &Subtarget, ++ unsigned ResOp) { ++ const StringRef ErrorMsgOOR = "argument out of range"; ++ unsigned Imm = cast(Node->getOperand(2))->getZExtValue(); ++ if (!isUInt(Imm)) { ++ emitErrorAndReplaceIntrinsicResults(Node, Results, DAG, ErrorMsgOOR, ++ /*WithChain=*/false); ++ return; ++ } ++ SDLoc DL(Node); ++ SDValue Vec = Node->getOperand(1); ++ ++ SDValue PickElt = ++ DAG.getNode(ResOp, DL, Subtarget.getGRLenVT(), Vec, ++ DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()), ++ DAG.getValueType(Vec.getValueType().getVectorElementType())); ++ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, Node->getValueType(0), ++ PickElt.getValue(0))); ++} ++ ++static void replaceVecCondBranchResults(SDNode *N, ++ SmallVectorImpl &Results, ++ SelectionDAG &DAG, ++ const LoongArchSubtarget &Subtarget, ++ unsigned ResOp) { ++ SDLoc DL(N); ++ SDValue Vec = N->getOperand(1); ++ ++ SDValue CB = DAG.getNode(ResOp, DL, Subtarget.getGRLenVT(), Vec); ++ Results.push_back( ++ DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), CB.getValue(0))); ++} ++ ++static void ++replaceINTRINSIC_WO_CHAINResults(SDNode *N, SmallVectorImpl &Results, ++ SelectionDAG &DAG, ++ const LoongArchSubtarget &Subtarget) { ++ switch (N->getConstantOperandVal(0)) { ++ default: ++ llvm_unreachable("Unexpected Intrinsic."); ++ case Intrinsic::loongarch_lsx_vpickve2gr_b: ++ replaceVPICKVE2GRResults<4>(N, Results, DAG, Subtarget, ++ LoongArchISD::VPICK_SEXT_ELT); ++ break; ++ case 
Intrinsic::loongarch_lsx_vpickve2gr_h: ++ replaceVPICKVE2GRResults<3>(N, Results, DAG, Subtarget, ++ LoongArchISD::VPICK_SEXT_ELT); ++ break; ++ case Intrinsic::loongarch_lsx_vpickve2gr_w: ++ replaceVPICKVE2GRResults<2>(N, Results, DAG, Subtarget, ++ LoongArchISD::VPICK_SEXT_ELT); ++ break; ++ case Intrinsic::loongarch_lsx_vpickve2gr_bu: ++ replaceVPICKVE2GRResults<4>(N, Results, DAG, Subtarget, ++ LoongArchISD::VPICK_ZEXT_ELT); ++ break; ++ case Intrinsic::loongarch_lsx_vpickve2gr_hu: ++ replaceVPICKVE2GRResults<3>(N, Results, DAG, Subtarget, ++ LoongArchISD::VPICK_ZEXT_ELT); ++ break; ++ case Intrinsic::loongarch_lsx_vpickve2gr_wu: ++ replaceVPICKVE2GRResults<2>(N, Results, DAG, Subtarget, ++ LoongArchISD::VPICK_ZEXT_ELT); ++ break; ++ case Intrinsic::loongarch_lsx_bz_b: ++ case Intrinsic::loongarch_lsx_bz_h: ++ case Intrinsic::loongarch_lsx_bz_w: ++ case Intrinsic::loongarch_lsx_bz_d: ++ replaceVecCondBranchResults(N, Results, DAG, Subtarget, ++ LoongArchISD::VALL_ZERO); ++ break; ++ case Intrinsic::loongarch_lsx_bz_v: ++ replaceVecCondBranchResults(N, Results, DAG, Subtarget, ++ LoongArchISD::VANY_ZERO); ++ break; ++ case Intrinsic::loongarch_lsx_bnz_b: ++ case Intrinsic::loongarch_lsx_bnz_h: ++ case Intrinsic::loongarch_lsx_bnz_w: ++ case Intrinsic::loongarch_lsx_bnz_d: ++ replaceVecCondBranchResults(N, Results, DAG, Subtarget, ++ LoongArchISD::VALL_NONZERO); ++ break; ++ case Intrinsic::loongarch_lsx_bnz_v: ++ replaceVecCondBranchResults(N, Results, DAG, Subtarget, ++ LoongArchISD::VANY_NONZERO); ++ break; ++ } ++} ++ + void LoongArchTargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + SDLoc DL(N); +@@ -1168,14 +1480,12 @@ void LoongArchTargetLowering::ReplaceNodeResults( + llvm_unreachable("Unexpected Intrinsic."); + case Intrinsic::loongarch_movfcsr2gr: { + if (!Subtarget.hasBasicF()) { +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, +- ErrorMsgReqF); ++ 
emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgReqF); + return; + } + unsigned Imm = cast(Op2)->getZExtValue(); + if (!isUInt<2>(Imm)) { +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, +- ErrorMsgOOR); ++ emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); + return; + } + SDValue MOVFCSR2GRResults = DAG.getNode( +@@ -1211,7 +1521,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( + {Chain, Op2, \ + DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(3))}); \ + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NODE.getValue(0))); \ +- Results.push_back(NODE.getValue(1)); \ ++ Results.push_back(NODE.getValue(1)); \ + break; \ + } + CRC_CASE_EXT_UNARYOP(crc_w_d_w, CRC_W_D_W) +@@ -1220,8 +1530,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( + #define CSR_CASE(ID) \ + case Intrinsic::loongarch_##ID: { \ + if (!Subtarget.is64Bit()) \ +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, \ +- ErrorMsgReqLA64); \ ++ emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgReqLA64); \ + break; \ + } + CSR_CASE(csrrd_d); +@@ -1232,8 +1541,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( + case Intrinsic::loongarch_csrrd_w: { + unsigned Imm = cast(Op2)->getZExtValue(); + if (!isUInt<14>(Imm)) { +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, +- ErrorMsgOOR); ++ emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); + return; + } + SDValue CSRRDResults = +@@ -1247,8 +1555,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( + case Intrinsic::loongarch_csrwr_w: { + unsigned Imm = cast(N->getOperand(3))->getZExtValue(); + if (!isUInt<14>(Imm)) { +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, +- ErrorMsgOOR); ++ emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); + return; + } + SDValue CSRWRResults = +@@ -1263,8 +1570,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( + case Intrinsic::loongarch_csrxchg_w: { + unsigned Imm = 
cast(N->getOperand(4))->getZExtValue(); + if (!isUInt<14>(Imm)) { +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, +- ErrorMsgOOR); ++ emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); + return; + } + SDValue CSRXCHGResults = DAG.getNode( +@@ -1302,8 +1608,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( + } + case Intrinsic::loongarch_lddir_d: { + if (!Subtarget.is64Bit()) { +- emitErrorAndReplaceIntrinsicWithChainResults(N, Results, DAG, +- ErrorMsgReqLA64); ++ emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgReqLA64); + return; + } + break; +@@ -1322,6 +1627,10 @@ void LoongArchTargetLowering::ReplaceNodeResults( + Results.push_back(N->getOperand(0)); + break; + } ++ case ISD::INTRINSIC_WO_CHAIN: { ++ replaceINTRINSIC_WO_CHAINResults(N, Results, DAG, Subtarget); ++ break; ++ } + } + } + +@@ -1685,6 +1994,440 @@ static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG, + Src.getOperand(0)); + } + ++template ++static SDValue legalizeIntrinsicImmArg(SDNode *Node, unsigned ImmOp, ++ SelectionDAG &DAG, ++ const LoongArchSubtarget &Subtarget, ++ bool IsSigned = false) { ++ SDLoc DL(Node); ++ auto *CImm = cast(Node->getOperand(ImmOp)); ++ // Check the ImmArg. ++ if ((IsSigned && !isInt(CImm->getSExtValue())) || ++ (!IsSigned && !isUInt(CImm->getZExtValue()))) { ++ DAG.getContext()->emitError(Node->getOperationName(0) + ++ ": argument out of range."); ++ return DAG.getNode(ISD::UNDEF, DL, Subtarget.getGRLenVT()); ++ } ++ return DAG.getConstant(CImm->getZExtValue(), DL, Subtarget.getGRLenVT()); ++} ++ ++template ++static SDValue lowerVectorSplatImm(SDNode *Node, unsigned ImmOp, ++ SelectionDAG &DAG, bool IsSigned = false) { ++ SDLoc DL(Node); ++ EVT ResTy = Node->getValueType(0); ++ auto *CImm = cast(Node->getOperand(ImmOp)); ++ ++ // Check the ImmArg. 
++ if ((IsSigned && !isInt(CImm->getSExtValue())) || ++ (!IsSigned && !isUInt(CImm->getZExtValue()))) { ++ DAG.getContext()->emitError(Node->getOperationName(0) + ++ ": argument out of range."); ++ return DAG.getNode(ISD::UNDEF, DL, ResTy); ++ } ++ return DAG.getConstant( ++ APInt(ResTy.getScalarType().getSizeInBits(), ++ IsSigned ? CImm->getSExtValue() : CImm->getZExtValue(), IsSigned), ++ DL, ResTy); ++} ++ ++static SDValue truncateVecElts(SDNode *Node, SelectionDAG &DAG) { ++ SDLoc DL(Node); ++ EVT ResTy = Node->getValueType(0); ++ SDValue Vec = Node->getOperand(2); ++ SDValue Mask = DAG.getConstant(Vec.getScalarValueSizeInBits() - 1, DL, ResTy); ++ return DAG.getNode(ISD::AND, DL, ResTy, Vec, Mask); ++} ++ ++static SDValue lowerVectorBitClear(SDNode *Node, SelectionDAG &DAG) { ++ SDLoc DL(Node); ++ EVT ResTy = Node->getValueType(0); ++ SDValue One = DAG.getConstant(1, DL, ResTy); ++ SDValue Bit = ++ DAG.getNode(ISD::SHL, DL, ResTy, One, truncateVecElts(Node, DAG)); ++ ++ return DAG.getNode(ISD::AND, DL, ResTy, Node->getOperand(1), ++ DAG.getNOT(DL, Bit, ResTy)); ++} ++ ++template ++static SDValue lowerVectorBitClearImm(SDNode *Node, SelectionDAG &DAG) { ++ SDLoc DL(Node); ++ EVT ResTy = Node->getValueType(0); ++ auto *CImm = cast(Node->getOperand(2)); ++ // Check the unsigned ImmArg. ++ if (!isUInt(CImm->getZExtValue())) { ++ DAG.getContext()->emitError(Node->getOperationName(0) + ++ ": argument out of range."); ++ return DAG.getNode(ISD::UNDEF, DL, ResTy); ++ } ++ ++ APInt BitImm = APInt(ResTy.getScalarSizeInBits(), 1) << CImm->getAPIntValue(); ++ SDValue Mask = DAG.getConstant(~BitImm, DL, ResTy); ++ ++ return DAG.getNode(ISD::AND, DL, ResTy, Node->getOperand(1), Mask); ++} ++ ++template ++static SDValue lowerVectorBitSetImm(SDNode *Node, SelectionDAG &DAG) { ++ SDLoc DL(Node); ++ EVT ResTy = Node->getValueType(0); ++ auto *CImm = cast(Node->getOperand(2)); ++ // Check the unsigned ImmArg. 
++ if (!isUInt(CImm->getZExtValue())) { ++ DAG.getContext()->emitError(Node->getOperationName(0) + ++ ": argument out of range."); ++ return DAG.getNode(ISD::UNDEF, DL, ResTy); ++ } ++ ++ APInt Imm = APInt(ResTy.getScalarSizeInBits(), 1) << CImm->getAPIntValue(); ++ SDValue BitImm = DAG.getConstant(Imm, DL, ResTy); ++ return DAG.getNode(ISD::OR, DL, ResTy, Node->getOperand(1), BitImm); ++} ++ ++template ++static SDValue lowerVectorBitRevImm(SDNode *Node, SelectionDAG &DAG) { ++ SDLoc DL(Node); ++ EVT ResTy = Node->getValueType(0); ++ auto *CImm = cast(Node->getOperand(2)); ++ // Check the unsigned ImmArg. ++ if (!isUInt(CImm->getZExtValue())) { ++ DAG.getContext()->emitError(Node->getOperationName(0) + ++ ": argument out of range."); ++ return DAG.getNode(ISD::UNDEF, DL, ResTy); ++ } ++ ++ APInt Imm = APInt(ResTy.getScalarSizeInBits(), 1) << CImm->getAPIntValue(); ++ SDValue BitImm = DAG.getConstant(Imm, DL, ResTy); ++ return DAG.getNode(ISD::XOR, DL, ResTy, Node->getOperand(1), BitImm); ++} ++ ++static SDValue ++performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, ++ TargetLowering::DAGCombinerInfo &DCI, ++ const LoongArchSubtarget &Subtarget) { ++ SDLoc DL(N); ++ switch (N->getConstantOperandVal(0)) { ++ default: ++ break; ++ case Intrinsic::loongarch_lsx_vadd_b: ++ case Intrinsic::loongarch_lsx_vadd_h: ++ case Intrinsic::loongarch_lsx_vadd_w: ++ case Intrinsic::loongarch_lsx_vadd_d: ++ return DAG.getNode(ISD::ADD, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vaddi_bu: ++ case Intrinsic::loongarch_lsx_vaddi_hu: ++ case Intrinsic::loongarch_lsx_vaddi_wu: ++ case Intrinsic::loongarch_lsx_vaddi_du: ++ return DAG.getNode(ISD::ADD, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsub_b: ++ case Intrinsic::loongarch_lsx_vsub_h: ++ case Intrinsic::loongarch_lsx_vsub_w: ++ case Intrinsic::loongarch_lsx_vsub_d: ++ return DAG.getNode(ISD::SUB, DL, 
N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vsubi_bu: ++ case Intrinsic::loongarch_lsx_vsubi_hu: ++ case Intrinsic::loongarch_lsx_vsubi_wu: ++ case Intrinsic::loongarch_lsx_vsubi_du: ++ return DAG.getNode(ISD::SUB, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vneg_b: ++ case Intrinsic::loongarch_lsx_vneg_h: ++ case Intrinsic::loongarch_lsx_vneg_w: ++ case Intrinsic::loongarch_lsx_vneg_d: ++ return DAG.getNode( ++ ISD::SUB, DL, N->getValueType(0), ++ DAG.getConstant( ++ APInt(N->getValueType(0).getScalarType().getSizeInBits(), 0, ++ /*isSigned=*/true), ++ SDLoc(N), N->getValueType(0)), ++ N->getOperand(1)); ++ case Intrinsic::loongarch_lsx_vmax_b: ++ case Intrinsic::loongarch_lsx_vmax_h: ++ case Intrinsic::loongarch_lsx_vmax_w: ++ case Intrinsic::loongarch_lsx_vmax_d: ++ return DAG.getNode(ISD::SMAX, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmax_bu: ++ case Intrinsic::loongarch_lsx_vmax_hu: ++ case Intrinsic::loongarch_lsx_vmax_wu: ++ case Intrinsic::loongarch_lsx_vmax_du: ++ return DAG.getNode(ISD::UMAX, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmaxi_b: ++ case Intrinsic::loongarch_lsx_vmaxi_h: ++ case Intrinsic::loongarch_lsx_vmaxi_w: ++ case Intrinsic::loongarch_lsx_vmaxi_d: ++ return DAG.getNode(ISD::SMAX, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG, /*IsSigned=*/true)); ++ case Intrinsic::loongarch_lsx_vmaxi_bu: ++ case Intrinsic::loongarch_lsx_vmaxi_hu: ++ case Intrinsic::loongarch_lsx_vmaxi_wu: ++ case Intrinsic::loongarch_lsx_vmaxi_du: ++ return DAG.getNode(ISD::UMAX, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vmin_b: ++ case Intrinsic::loongarch_lsx_vmin_h: ++ case Intrinsic::loongarch_lsx_vmin_w: ++ case 
Intrinsic::loongarch_lsx_vmin_d: ++ return DAG.getNode(ISD::SMIN, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmin_bu: ++ case Intrinsic::loongarch_lsx_vmin_hu: ++ case Intrinsic::loongarch_lsx_vmin_wu: ++ case Intrinsic::loongarch_lsx_vmin_du: ++ return DAG.getNode(ISD::UMIN, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmini_b: ++ case Intrinsic::loongarch_lsx_vmini_h: ++ case Intrinsic::loongarch_lsx_vmini_w: ++ case Intrinsic::loongarch_lsx_vmini_d: ++ return DAG.getNode(ISD::SMIN, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG, /*IsSigned=*/true)); ++ case Intrinsic::loongarch_lsx_vmini_bu: ++ case Intrinsic::loongarch_lsx_vmini_hu: ++ case Intrinsic::loongarch_lsx_vmini_wu: ++ case Intrinsic::loongarch_lsx_vmini_du: ++ return DAG.getNode(ISD::UMIN, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vmul_b: ++ case Intrinsic::loongarch_lsx_vmul_h: ++ case Intrinsic::loongarch_lsx_vmul_w: ++ case Intrinsic::loongarch_lsx_vmul_d: ++ return DAG.getNode(ISD::MUL, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmadd_b: ++ case Intrinsic::loongarch_lsx_vmadd_h: ++ case Intrinsic::loongarch_lsx_vmadd_w: ++ case Intrinsic::loongarch_lsx_vmadd_d: { ++ EVT ResTy = N->getValueType(0); ++ return DAG.getNode(ISD::ADD, SDLoc(N), ResTy, N->getOperand(1), ++ DAG.getNode(ISD::MUL, SDLoc(N), ResTy, N->getOperand(2), ++ N->getOperand(3))); ++ } ++ case Intrinsic::loongarch_lsx_vmsub_b: ++ case Intrinsic::loongarch_lsx_vmsub_h: ++ case Intrinsic::loongarch_lsx_vmsub_w: ++ case Intrinsic::loongarch_lsx_vmsub_d: { ++ EVT ResTy = N->getValueType(0); ++ return DAG.getNode(ISD::SUB, SDLoc(N), ResTy, N->getOperand(1), ++ DAG.getNode(ISD::MUL, SDLoc(N), ResTy, N->getOperand(2), ++ N->getOperand(3))); ++ } ++ case 
Intrinsic::loongarch_lsx_vdiv_b: ++ case Intrinsic::loongarch_lsx_vdiv_h: ++ case Intrinsic::loongarch_lsx_vdiv_w: ++ case Intrinsic::loongarch_lsx_vdiv_d: ++ return DAG.getNode(ISD::SDIV, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vdiv_bu: ++ case Intrinsic::loongarch_lsx_vdiv_hu: ++ case Intrinsic::loongarch_lsx_vdiv_wu: ++ case Intrinsic::loongarch_lsx_vdiv_du: ++ return DAG.getNode(ISD::UDIV, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmod_b: ++ case Intrinsic::loongarch_lsx_vmod_h: ++ case Intrinsic::loongarch_lsx_vmod_w: ++ case Intrinsic::loongarch_lsx_vmod_d: ++ return DAG.getNode(ISD::SREM, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vmod_bu: ++ case Intrinsic::loongarch_lsx_vmod_hu: ++ case Intrinsic::loongarch_lsx_vmod_wu: ++ case Intrinsic::loongarch_lsx_vmod_du: ++ return DAG.getNode(ISD::UREM, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vand_v: ++ return DAG.getNode(ISD::AND, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vor_v: ++ return DAG.getNode(ISD::OR, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vxor_v: ++ return DAG.getNode(ISD::XOR, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vnor_v: { ++ SDValue Res = DAG.getNode(ISD::OR, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ return DAG.getNOT(DL, Res, Res->getValueType(0)); ++ } ++ case Intrinsic::loongarch_lsx_vandi_b: ++ return DAG.getNode(ISD::AND, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<8>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vori_b: ++ return DAG.getNode(ISD::OR, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<8>(N, 2, DAG)); ++ case 
Intrinsic::loongarch_lsx_vxori_b: ++ return DAG.getNode(ISD::XOR, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<8>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsll_b: ++ case Intrinsic::loongarch_lsx_vsll_h: ++ case Intrinsic::loongarch_lsx_vsll_w: ++ case Intrinsic::loongarch_lsx_vsll_d: ++ return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), ++ truncateVecElts(N, DAG)); ++ case Intrinsic::loongarch_lsx_vslli_b: ++ return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<3>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vslli_h: ++ return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<4>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vslli_w: ++ return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vslli_d: ++ return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<6>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrl_b: ++ case Intrinsic::loongarch_lsx_vsrl_h: ++ case Intrinsic::loongarch_lsx_vsrl_w: ++ case Intrinsic::loongarch_lsx_vsrl_d: ++ return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), ++ truncateVecElts(N, DAG)); ++ case Intrinsic::loongarch_lsx_vsrli_b: ++ return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<3>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrli_h: ++ return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<4>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrli_w: ++ return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrli_d: ++ return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<6>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsra_b: ++ case 
Intrinsic::loongarch_lsx_vsra_h: ++ case Intrinsic::loongarch_lsx_vsra_w: ++ case Intrinsic::loongarch_lsx_vsra_d: ++ return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), ++ truncateVecElts(N, DAG)); ++ case Intrinsic::loongarch_lsx_vsrai_b: ++ return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<3>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrai_h: ++ return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<4>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrai_w: ++ return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<5>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vsrai_d: ++ return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), ++ lowerVectorSplatImm<6>(N, 2, DAG)); ++ case Intrinsic::loongarch_lsx_vpcnt_b: ++ case Intrinsic::loongarch_lsx_vpcnt_h: ++ case Intrinsic::loongarch_lsx_vpcnt_w: ++ case Intrinsic::loongarch_lsx_vpcnt_d: ++ return DAG.getNode(ISD::CTPOP, DL, N->getValueType(0), N->getOperand(1)); ++ case Intrinsic::loongarch_lsx_vbitclr_b: ++ case Intrinsic::loongarch_lsx_vbitclr_h: ++ case Intrinsic::loongarch_lsx_vbitclr_w: ++ case Intrinsic::loongarch_lsx_vbitclr_d: ++ return lowerVectorBitClear(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitclri_b: ++ return lowerVectorBitClearImm<3>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitclri_h: ++ return lowerVectorBitClearImm<4>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitclri_w: ++ return lowerVectorBitClearImm<5>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitclri_d: ++ return lowerVectorBitClearImm<6>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitset_b: ++ case Intrinsic::loongarch_lsx_vbitset_h: ++ case Intrinsic::loongarch_lsx_vbitset_w: ++ case Intrinsic::loongarch_lsx_vbitset_d: { ++ EVT VecTy = N->getValueType(0); ++ SDValue One = DAG.getConstant(1, DL, VecTy); ++ return DAG.getNode( ++ ISD::OR, DL, VecTy, N->getOperand(1), ++ 
DAG.getNode(ISD::SHL, DL, VecTy, One, truncateVecElts(N, DAG))); ++ } ++ case Intrinsic::loongarch_lsx_vbitseti_b: ++ return lowerVectorBitSetImm<3>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitseti_h: ++ return lowerVectorBitSetImm<4>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitseti_w: ++ return lowerVectorBitSetImm<5>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitseti_d: ++ return lowerVectorBitSetImm<6>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitrev_b: ++ case Intrinsic::loongarch_lsx_vbitrev_h: ++ case Intrinsic::loongarch_lsx_vbitrev_w: ++ case Intrinsic::loongarch_lsx_vbitrev_d: { ++ EVT VecTy = N->getValueType(0); ++ SDValue One = DAG.getConstant(1, DL, VecTy); ++ return DAG.getNode( ++ ISD::XOR, DL, VecTy, N->getOperand(1), ++ DAG.getNode(ISD::SHL, DL, VecTy, One, truncateVecElts(N, DAG))); ++ } ++ case Intrinsic::loongarch_lsx_vbitrevi_b: ++ return lowerVectorBitRevImm<3>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitrevi_h: ++ return lowerVectorBitRevImm<4>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitrevi_w: ++ return lowerVectorBitRevImm<5>(N, DAG); ++ case Intrinsic::loongarch_lsx_vbitrevi_d: ++ return lowerVectorBitRevImm<6>(N, DAG); ++ case Intrinsic::loongarch_lsx_vfadd_s: ++ case Intrinsic::loongarch_lsx_vfadd_d: ++ return DAG.getNode(ISD::FADD, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vfsub_s: ++ case Intrinsic::loongarch_lsx_vfsub_d: ++ return DAG.getNode(ISD::FSUB, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vfmul_s: ++ case Intrinsic::loongarch_lsx_vfmul_d: ++ return DAG.getNode(ISD::FMUL, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vfdiv_s: ++ case Intrinsic::loongarch_lsx_vfdiv_d: ++ return DAG.getNode(ISD::FDIV, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2)); ++ case Intrinsic::loongarch_lsx_vfmadd_s: ++ case Intrinsic::loongarch_lsx_vfmadd_d: ++ return 
DAG.getNode(ISD::FMA, DL, N->getValueType(0), N->getOperand(1), ++ N->getOperand(2), N->getOperand(3)); ++ case Intrinsic::loongarch_lsx_vinsgr2vr_b: ++ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), ++ N->getOperand(1), N->getOperand(2), ++ legalizeIntrinsicImmArg<4>(N, 3, DAG, Subtarget)); ++ case Intrinsic::loongarch_lsx_vinsgr2vr_h: ++ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), ++ N->getOperand(1), N->getOperand(2), ++ legalizeIntrinsicImmArg<3>(N, 3, DAG, Subtarget)); ++ case Intrinsic::loongarch_lsx_vinsgr2vr_w: ++ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), ++ N->getOperand(1), N->getOperand(2), ++ legalizeIntrinsicImmArg<2>(N, 3, DAG, Subtarget)); ++ case Intrinsic::loongarch_lsx_vinsgr2vr_d: ++ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), ++ N->getOperand(1), N->getOperand(2), ++ legalizeIntrinsicImmArg<1>(N, 3, DAG, Subtarget)); ++ case Intrinsic::loongarch_lsx_vreplgr2vr_b: ++ case Intrinsic::loongarch_lsx_vreplgr2vr_h: ++ case Intrinsic::loongarch_lsx_vreplgr2vr_w: ++ case Intrinsic::loongarch_lsx_vreplgr2vr_d: { ++ EVT ResTy = N->getValueType(0); ++ SmallVector Ops(ResTy.getVectorNumElements(), N->getOperand(1)); ++ return DAG.getBuildVector(ResTy, DL, Ops); ++ } ++ case Intrinsic::loongarch_lsx_vreplve_b: ++ case Intrinsic::loongarch_lsx_vreplve_h: ++ case Intrinsic::loongarch_lsx_vreplve_w: ++ case Intrinsic::loongarch_lsx_vreplve_d: ++ return DAG.getNode(LoongArchISD::VREPLVE, DL, N->getValueType(0), ++ N->getOperand(1), ++ DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getGRLenVT(), ++ N->getOperand(2))); ++ } ++ return SDValue(); ++} ++ + SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; +@@ -1699,6 +2442,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, + return performSRLCombine(N, DAG, DCI, Subtarget); + case LoongArchISD::BITREV_W: + return 
performBITREV_WCombine(N, DAG, DCI, Subtarget); ++ case ISD::INTRINSIC_WO_CHAIN: ++ return performINTRINSIC_WO_CHAINCombine(N, DAG, DCI, Subtarget); + } + return SDValue(); + } +@@ -1752,6 +2497,101 @@ static MachineBasicBlock *insertDivByZeroTrap(MachineInstr &MI, + return SinkMBB; + } + ++static MachineBasicBlock * ++emitVecCondBranchPseudo(MachineInstr &MI, MachineBasicBlock *BB, ++ const LoongArchSubtarget &Subtarget) { ++ unsigned CondOpc; ++ switch (MI.getOpcode()) { ++ default: ++ llvm_unreachable("Unexpected opcode"); ++ case LoongArch::PseudoVBZ: ++ CondOpc = LoongArch::VSETEQZ_V; ++ break; ++ case LoongArch::PseudoVBZ_B: ++ CondOpc = LoongArch::VSETANYEQZ_B; ++ break; ++ case LoongArch::PseudoVBZ_H: ++ CondOpc = LoongArch::VSETANYEQZ_H; ++ break; ++ case LoongArch::PseudoVBZ_W: ++ CondOpc = LoongArch::VSETANYEQZ_W; ++ break; ++ case LoongArch::PseudoVBZ_D: ++ CondOpc = LoongArch::VSETANYEQZ_D; ++ break; ++ case LoongArch::PseudoVBNZ: ++ CondOpc = LoongArch::VSETNEZ_V; ++ break; ++ case LoongArch::PseudoVBNZ_B: ++ CondOpc = LoongArch::VSETALLNEZ_B; ++ break; ++ case LoongArch::PseudoVBNZ_H: ++ CondOpc = LoongArch::VSETALLNEZ_H; ++ break; ++ case LoongArch::PseudoVBNZ_W: ++ CondOpc = LoongArch::VSETALLNEZ_W; ++ break; ++ case LoongArch::PseudoVBNZ_D: ++ CondOpc = LoongArch::VSETALLNEZ_D; ++ break; ++ } ++ ++ const TargetInstrInfo *TII = Subtarget.getInstrInfo(); ++ const BasicBlock *LLVM_BB = BB->getBasicBlock(); ++ DebugLoc DL = MI.getDebugLoc(); ++ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); ++ MachineFunction::iterator It = ++BB->getIterator(); ++ ++ MachineFunction *F = BB->getParent(); ++ MachineBasicBlock *FalseBB = F->CreateMachineBasicBlock(LLVM_BB); ++ MachineBasicBlock *TrueBB = F->CreateMachineBasicBlock(LLVM_BB); ++ MachineBasicBlock *SinkBB = F->CreateMachineBasicBlock(LLVM_BB); ++ ++ F->insert(It, FalseBB); ++ F->insert(It, TrueBB); ++ F->insert(It, SinkBB); ++ ++ // Transfer the remainder of MBB and its successor edges to Sink. 
++ SinkBB->splice(SinkBB->end(), BB, std::next(MI.getIterator()), BB->end()); ++ SinkBB->transferSuccessorsAndUpdatePHIs(BB); ++ ++ // Insert the real instruction to BB. ++ Register FCC = MRI.createVirtualRegister(&LoongArch::CFRRegClass); ++ BuildMI(BB, DL, TII->get(CondOpc), FCC).addReg(MI.getOperand(1).getReg()); ++ ++ // Insert branch. ++ BuildMI(BB, DL, TII->get(LoongArch::BCNEZ)).addReg(FCC).addMBB(TrueBB); ++ BB->addSuccessor(FalseBB); ++ BB->addSuccessor(TrueBB); ++ ++ // FalseBB. ++ Register RD1 = MRI.createVirtualRegister(&LoongArch::GPRRegClass); ++ BuildMI(FalseBB, DL, TII->get(LoongArch::ADDI_W), RD1) ++ .addReg(LoongArch::R0) ++ .addImm(0); ++ BuildMI(FalseBB, DL, TII->get(LoongArch::PseudoBR)).addMBB(SinkBB); ++ FalseBB->addSuccessor(SinkBB); ++ ++ // TrueBB. ++ Register RD2 = MRI.createVirtualRegister(&LoongArch::GPRRegClass); ++ BuildMI(TrueBB, DL, TII->get(LoongArch::ADDI_W), RD2) ++ .addReg(LoongArch::R0) ++ .addImm(1); ++ TrueBB->addSuccessor(SinkBB); ++ ++ // SinkBB: merge the results. ++ BuildMI(*SinkBB, SinkBB->begin(), DL, TII->get(LoongArch::PHI), ++ MI.getOperand(0).getReg()) ++ .addReg(RD1) ++ .addMBB(FalseBB) ++ .addReg(RD2) ++ .addMBB(TrueBB); ++ ++ // The pseudo instruction is gone now. 
++ MI.eraseFromParent(); ++ return SinkBB; ++} ++ + MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); +@@ -1786,6 +2626,17 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + MI.eraseFromParent(); + return BB; + } ++ case LoongArch::PseudoVBZ: ++ case LoongArch::PseudoVBZ_B: ++ case LoongArch::PseudoVBZ_H: ++ case LoongArch::PseudoVBZ_W: ++ case LoongArch::PseudoVBZ_D: ++ case LoongArch::PseudoVBNZ: ++ case LoongArch::PseudoVBNZ_B: ++ case LoongArch::PseudoVBNZ_H: ++ case LoongArch::PseudoVBNZ_W: ++ case LoongArch::PseudoVBNZ_D: ++ return emitVecCondBranchPseudo(MI, BB, Subtarget); + } + } + +@@ -1858,6 +2709,13 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { + NODE_NAME_CASE(MOVFCSR2GR) + NODE_NAME_CASE(CACOP_D) + NODE_NAME_CASE(CACOP_W) ++ NODE_NAME_CASE(VPICK_SEXT_ELT) ++ NODE_NAME_CASE(VPICK_ZEXT_ELT) ++ NODE_NAME_CASE(VREPLVE) ++ NODE_NAME_CASE(VALL_ZERO) ++ NODE_NAME_CASE(VANY_ZERO) ++ NODE_NAME_CASE(VALL_NONZERO) ++ NODE_NAME_CASE(VANY_NONZERO) + } + #undef NODE_NAME_CASE + return nullptr; +@@ -1884,6 +2742,10 @@ const MCPhysReg ArgFPR64s[] = { + LoongArch::F0_64, LoongArch::F1_64, LoongArch::F2_64, LoongArch::F3_64, + LoongArch::F4_64, LoongArch::F5_64, LoongArch::F6_64, LoongArch::F7_64}; + ++const MCPhysReg ArgVRs[] = {LoongArch::VR0, LoongArch::VR1, LoongArch::VR2, ++ LoongArch::VR3, LoongArch::VR4, LoongArch::VR5, ++ LoongArch::VR6, LoongArch::VR7}; ++ + // Pass a 2*GRLen argument that has been split into two GRLen values through + // registers or the stack as necessary. 
+ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State, +@@ -2030,6 +2892,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, + Reg = State.AllocateReg(ArgFPR32s); + else if (ValVT == MVT::f64 && !UseGPRForFloat) + Reg = State.AllocateReg(ArgFPR64s); ++ else if (ValVT.is128BitVector()) ++ Reg = State.AllocateReg(ArgVRs); + else + Reg = State.AllocateReg(ArgGPRs); + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 500407493fe5..7765057ebffb 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -110,6 +110,20 @@ enum NodeType : unsigned { + + // Read CPU configuration information operation + CPUCFG, ++ ++ // Vector Shuffle ++ VREPLVE, ++ ++ // Extended vector element extraction ++ VPICK_SEXT_ELT, ++ VPICK_ZEXT_ELT, ++ ++ // Vector comparisons ++ VALL_ZERO, ++ VANY_ZERO, ++ VALL_NONZERO, ++ VANY_NONZERO, ++ + // Intrinsic operations end ============================================= + }; + } // end namespace LoongArchISD +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +index ef79b8a0dcd3..a5d66ebac96a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +@@ -47,6 +47,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + return; + } + ++ // VR->VR copies. ++ if (LoongArch::LSX128RegClass.contains(DstReg, SrcReg)) { ++ BuildMI(MBB, MBBI, DL, get(LoongArch::VORI_B), DstReg) ++ .addReg(SrcReg, getKillRegState(KillSrc)) ++ .addImm(0); ++ return; ++ } ++ + // GPR->CFR copy. 
+ if (LoongArch::CFRRegClass.contains(DstReg) && + LoongArch::GPRRegClass.contains(SrcReg)) { +@@ -99,6 +107,8 @@ void LoongArchInstrInfo::storeRegToStackSlot( + Opcode = LoongArch::FST_S; + else if (LoongArch::FPR64RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FST_D; ++ else if (LoongArch::LSX128RegClass.hasSubClassEq(RC)) ++ Opcode = LoongArch::VST; + else if (LoongArch::CFRRegClass.hasSubClassEq(RC)) + Opcode = LoongArch::PseudoST_CFR; + else +@@ -133,6 +143,8 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + Opcode = LoongArch::FLD_S; + else if (LoongArch::FPR64RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::FLD_D; ++ else if (LoongArch::LSX128RegClass.hasSubClassEq(RC)) ++ Opcode = LoongArch::VLD; + else if (LoongArch::CFRRegClass.hasSubClassEq(RC)) + Opcode = LoongArch::PseudoLD_CFR; + else +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index ac391ef471b1..b2c4bb812ba5 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -182,7 +182,7 @@ def imm32 : Operand { + let ParserMatchClass = ImmAsmOperand<"", 32, "">; + } + +-def uimm1 : Operand { ++def uimm1 : Operand, ImmLeaf(Imm);}]>{ + let ParserMatchClass = UImmAsmOperand<1>; + } + +@@ -197,11 +197,11 @@ def uimm2_plus1 : Operand, + let DecoderMethod = "decodeUImmOperand<2, 1>"; + } + +-def uimm3 : Operand { ++def uimm3 : Operand, ImmLeaf(Imm);}]> { + let ParserMatchClass = UImmAsmOperand<3>; + } + +-def uimm4 : Operand { ++def uimm4 : Operand, ImmLeaf(Imm);}]> { + let ParserMatchClass = UImmAsmOperand<4>; + } + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index a8ed285a37cf..13332be0bc38 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -10,6 +10,146 @@ + // + 
//===----------------------------------------------------------------------===// + ++def SDT_LoongArchVreplve : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>, ++ SDTCisInt<1>, SDTCisVec<1>, ++ SDTCisSameAs<0, 1>, SDTCisInt<2>]>; ++def SDT_LoongArchVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>; ++ ++// Target nodes. ++def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; ++def loongarch_vall_nonzero : SDNode<"LoongArchISD::VALL_NONZERO", ++ SDT_LoongArchVecCond>; ++def loongarch_vany_nonzero : SDNode<"LoongArchISD::VANY_NONZERO", ++ SDT_LoongArchVecCond>; ++def loongarch_vall_zero : SDNode<"LoongArchISD::VALL_ZERO", ++ SDT_LoongArchVecCond>; ++def loongarch_vany_zero : SDNode<"LoongArchISD::VANY_ZERO", ++ SDT_LoongArchVecCond>; ++ ++def loongarch_vpick_sext_elt : SDNode<"LoongArchISD::VPICK_SEXT_ELT", ++ SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>; ++def loongarch_vpick_zext_elt : SDNode<"LoongArchISD::VPICK_ZEXT_ELT", ++ SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>>; ++ ++class VecCond ++ : Pseudo<(outs GPR:$rd), (ins RC:$vj), ++ [(set GPR:$rd, (OpNode (TyNode RC:$vj)))]> { ++ let hasSideEffects = 0; ++ let mayLoad = 0; ++ let mayStore = 0; ++ let usesCustomInserter = 1; ++} ++ ++def vsplat_imm_eq_1 : PatFrags<(ops), [(build_vector), ++ (bitconvert (v4i32 (build_vector)))], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1; ++}]>; ++ ++def vsplati8_imm_eq_7 : PatFrags<(ops), [(build_vector)], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 7; ++}]>; ++def vsplati16_imm_eq_15 : PatFrags<(ops), 
[(build_vector)], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 15; ++}]>; ++def vsplati32_imm_eq_31 : PatFrags<(ops), [(build_vector)], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 31; ++}]>; ++def vsplati64_imm_eq_63 : PatFrags<(ops), [(build_vector), ++ (bitconvert (v4i32 (build_vector)))], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ ++ if (N->getOpcode() == ISD::BITCAST) ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 63; ++}]>; ++ ++def vsplati8imm7 : PatFrag<(ops node:$reg), ++ (and node:$reg, vsplati8_imm_eq_7)>; ++def vsplati16imm15 : PatFrag<(ops node:$reg), ++ (and node:$reg, vsplati16_imm_eq_15)>; ++def vsplati32imm31 : PatFrag<(ops node:$reg), ++ (and node:$reg, vsplati32_imm_eq_31)>; ++def vsplati64imm63 : PatFrag<(ops node:$reg), ++ (and node:$reg, vsplati64_imm_eq_63)>; ++ ++foreach N = [3, 4, 5, 6, 8] in ++ def SplatPat_uimm#N : ComplexPattern", ++ [build_vector, bitconvert], [], 2>; ++ ++foreach N = [5] in ++ def SplatPat_simm#N : ComplexPattern", ++ [build_vector, bitconvert]>; ++ ++def vsplat_uimm_inv_pow2 : ComplexPattern; ++ ++def vsplat_uimm_pow2 : ComplexPattern; ++ ++def muladd : PatFrag<(ops node:$vd, node:$vj, node:$vk), ++ (add node:$vd, (mul node:$vj, node:$vk))>; ++ ++def mulsub : PatFrag<(ops node:$vd, node:$vj, node:$vk), ++ (sub node:$vd, (mul node:$vj, node:$vk))>; ++ ++def lsxsplati8 : PatFrag<(ops node:$e0), ++ (v16i8 (build_vector node:$e0, node:$e0, ++ 
node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0))>; ++def lsxsplati16 : PatFrag<(ops node:$e0), ++ (v8i16 (build_vector node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0, ++ node:$e0, node:$e0))>; ++def lsxsplati32 : PatFrag<(ops node:$e0), ++ (v4i32 (build_vector node:$e0, node:$e0, ++ node:$e0, node:$e0))>; ++ ++def lsxsplati64 : PatFrag<(ops node:$e0), ++ (v2i64 (build_vector node:$e0, node:$e0))>; ++ ++def to_valide_timm : SDNodeXForm(N); ++ return CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(N), Subtarget->getGRLenVT()); ++}]>; ++ + //===----------------------------------------------------------------------===// + // Instruction class templates + //===----------------------------------------------------------------------===// +@@ -1004,4 +1144,680 @@ def PseudoVREPLI_D : Pseudo<(outs LSX128:$vd), (ins simm10:$imm), [], + "vrepli.d", "$vd, $imm">; + } + ++def PseudoVBNZ_B : VecCond; ++def PseudoVBNZ_H : VecCond; ++def PseudoVBNZ_W : VecCond; ++def PseudoVBNZ_D : VecCond; ++def PseudoVBNZ : VecCond; ++ ++def PseudoVBZ_B : VecCond; ++def PseudoVBZ_H : VecCond; ++def PseudoVBZ_W : VecCond; ++def PseudoVBZ_D : VecCond; ++def PseudoVBZ : VecCond; ++ ++} // Predicates = [HasExtLSX] ++ ++multiclass PatVr { ++ def : Pat<(v16i8 (OpNode (v16i8 LSX128:$vj))), ++ (!cast(Inst#"_B") LSX128:$vj)>; ++ def : Pat<(v8i16 (OpNode (v8i16 LSX128:$vj))), ++ (!cast(Inst#"_H") LSX128:$vj)>; ++ def : Pat<(v4i32 (OpNode (v4i32 LSX128:$vj))), ++ (!cast(Inst#"_W") LSX128:$vj)>; ++ def : Pat<(v2i64 (OpNode (v2i64 LSX128:$vj))), ++ (!cast(Inst#"_D") LSX128:$vj)>; ++} ++ ++multiclass PatVrVr { ++ def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (!cast(Inst#"_B") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), ++ (!cast(Inst#"_H") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 
LSX128:$vk)), ++ (!cast(Inst#"_W") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), ++ (!cast(Inst#"_D") LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatVrVrF { ++ def : Pat<(OpNode (v4f32 LSX128:$vj), (v4f32 LSX128:$vk)), ++ (!cast(Inst#"_S") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v2f64 LSX128:$vj), (v2f64 LSX128:$vk)), ++ (!cast(Inst#"_D") LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatVrVrU { ++ def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (!cast(Inst#"_BU") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), ++ (!cast(Inst#"_HU") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), ++ (!cast(Inst#"_WU") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), ++ (!cast(Inst#"_DU") LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatVrSimm5 { ++ def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_B") LSX128:$vj, simm5:$imm)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_H") LSX128:$vj, simm5:$imm)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_W") LSX128:$vj, simm5:$imm)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_D") LSX128:$vj, simm5:$imm)>; ++} ++ ++multiclass PatVrUimm5 { ++ def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_BU") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_HU") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_WU") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_DU") LSX128:$vj, uimm5:$imm)>; ++} ++ ++multiclass PatVrVrVr { 
++ def : Pat<(OpNode (v16i8 LSX128:$vd), (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (!cast(Inst#"_B") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vd), (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), ++ (!cast(Inst#"_H") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vd), (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), ++ (!cast(Inst#"_W") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vd), (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), ++ (!cast(Inst#"_D") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatShiftVrVr { ++ def : Pat<(OpNode (v16i8 LSX128:$vj), (and vsplati8_imm_eq_7, ++ (v16i8 LSX128:$vk))), ++ (!cast(Inst#"_B") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vj), (and vsplati16_imm_eq_15, ++ (v8i16 LSX128:$vk))), ++ (!cast(Inst#"_H") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vj), (and vsplati32_imm_eq_31, ++ (v4i32 LSX128:$vk))), ++ (!cast(Inst#"_W") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vj), (and vsplati64_imm_eq_63, ++ (v2i64 LSX128:$vk))), ++ (!cast(Inst#"_D") LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatShiftVrUimm { ++ def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm3 uimm3:$imm))), ++ (!cast(Inst#"_B") LSX128:$vj, uimm3:$imm)>; ++ def : Pat<(OpNode (v8i16 LSX128:$vj), (v8i16 (SplatPat_uimm4 uimm4:$imm))), ++ (!cast(Inst#"_H") LSX128:$vj, uimm4:$imm)>; ++ def : Pat<(OpNode (v4i32 LSX128:$vj), (v4i32 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_W") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(OpNode (v2i64 LSX128:$vj), (v2i64 (SplatPat_uimm6 uimm6:$imm))), ++ (!cast(Inst#"_D") LSX128:$vj, uimm6:$imm)>; ++} ++ ++class PatVrVrB ++ : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (Inst LSX128:$vj, LSX128:$vk)>; ++ ++let Predicates = [HasExtLSX] in { ++ ++// VADD_{B/H/W/D} ++defm : PatVrVr; ++// VSUB_{B/H/W/D} ++defm : PatVrVr; ++ ++// VADDI_{B/H/W/D}U ++defm : PatVrUimm5; ++// VSUBI_{B/H/W/D}U 
++defm : PatVrUimm5; ++ ++// VNEG_{B/H/W/D} ++def : Pat<(sub immAllZerosV, (v16i8 LSX128:$vj)), (VNEG_B LSX128:$vj)>; ++def : Pat<(sub immAllZerosV, (v8i16 LSX128:$vj)), (VNEG_H LSX128:$vj)>; ++def : Pat<(sub immAllZerosV, (v4i32 LSX128:$vj)), (VNEG_W LSX128:$vj)>; ++def : Pat<(sub immAllZerosV, (v2i64 LSX128:$vj)), (VNEG_D LSX128:$vj)>; ++ ++// VMAX[I]_{B/H/W/D}[U] ++defm : PatVrVr; ++defm : PatVrVrU; ++defm : PatVrSimm5; ++defm : PatVrUimm5; ++ ++// VMIN[I]_{B/H/W/D}[U] ++defm : PatVrVr; ++defm : PatVrVrU; ++defm : PatVrSimm5; ++defm : PatVrUimm5; ++ ++// VMUL_{B/H/W/D} ++defm : PatVrVr; ++ ++// VMADD_{B/H/W/D} ++defm : PatVrVrVr; ++// VMSUB_{B/H/W/D} ++defm : PatVrVrVr; ++ ++// VDIV_{B/H/W/D}[U] ++defm : PatVrVr; ++defm : PatVrVrU; ++ ++// VMOD_{B/H/W/D}[U] ++defm : PatVrVr; ++defm : PatVrVrU; ++ ++// VAND_V ++def : PatVrVrB; ++// VNOR_V ++def : PatVrVrB; ++// VXOR_V ++def : PatVrVrB; ++// VNOR_V ++def : Pat<(vnot (or (v16i8 LSX128:$vj), (v16i8 LSX128:$vk))), ++ (VNOR_V LSX128:$vj, LSX128:$vk)>; ++ ++// VANDI_B ++def : Pat<(and (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))), ++ (VANDI_B LSX128:$vj, uimm8:$imm)>; ++// VORI_B ++def : Pat<(or (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))), ++ (VORI_B LSX128:$vj, uimm8:$imm)>; ++ ++// VXORI_B ++def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (SplatPat_uimm8 uimm8:$imm))), ++ (VXORI_B LSX128:$vj, uimm8:$imm)>; ++ ++// VSLL[I]_{B/H/W/D} ++defm : PatVrVr; ++defm : PatShiftVrVr; ++defm : PatShiftVrUimm; ++ ++// VSRL[I]_{B/H/W/D} ++defm : PatVrVr; ++defm : PatShiftVrVr; ++defm : PatShiftVrUimm; ++ ++// VSRA[I]_{B/H/W/D} ++defm : PatVrVr; ++defm : PatShiftVrVr; ++defm : PatShiftVrUimm; ++ ++// VPCNT_{B/H/W/D} ++defm : PatVr; ++ ++// VBITCLR_{B/H/W/D} ++def : Pat<(and v16i8:$vj, (vnot (shl vsplat_imm_eq_1, v16i8:$vk))), ++ (v16i8 (VBITCLR_B v16i8:$vj, v16i8:$vk))>; ++def : Pat<(and v8i16:$vj, (vnot (shl vsplat_imm_eq_1, v8i16:$vk))), ++ (v8i16 (VBITCLR_H v8i16:$vj, v8i16:$vk))>; ++def : Pat<(and v4i32:$vj, 
(vnot (shl vsplat_imm_eq_1, v4i32:$vk))), ++ (v4i32 (VBITCLR_W v4i32:$vj, v4i32:$vk))>; ++def : Pat<(and v2i64:$vj, (vnot (shl vsplat_imm_eq_1, v2i64:$vk))), ++ (v2i64 (VBITCLR_D v2i64:$vj, v2i64:$vk))>; ++def : Pat<(and v16i8:$vj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati8imm7 v16i8:$vk)))), ++ (v16i8 (VBITCLR_B v16i8:$vj, v16i8:$vk))>; ++def : Pat<(and v8i16:$vj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati16imm15 v8i16:$vk)))), ++ (v8i16 (VBITCLR_H v8i16:$vj, v8i16:$vk))>; ++def : Pat<(and v4i32:$vj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati32imm31 v4i32:$vk)))), ++ (v4i32 (VBITCLR_W v4i32:$vj, v4i32:$vk))>; ++def : Pat<(and v2i64:$vj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati64imm63 v2i64:$vk)))), ++ (v2i64 (VBITCLR_D v2i64:$vj, v2i64:$vk))>; ++ ++// VBITCLRI_{B/H/W/D} ++def : Pat<(and (v16i8 LSX128:$vj), (v16i8 (vsplat_uimm_inv_pow2 uimm3:$imm))), ++ (VBITCLRI_B LSX128:$vj, uimm3:$imm)>; ++def : Pat<(and (v8i16 LSX128:$vj), (v8i16 (vsplat_uimm_inv_pow2 uimm4:$imm))), ++ (VBITCLRI_H LSX128:$vj, uimm4:$imm)>; ++def : Pat<(and (v4i32 LSX128:$vj), (v4i32 (vsplat_uimm_inv_pow2 uimm5:$imm))), ++ (VBITCLRI_W LSX128:$vj, uimm5:$imm)>; ++def : Pat<(and (v2i64 LSX128:$vj), (v2i64 (vsplat_uimm_inv_pow2 uimm6:$imm))), ++ (VBITCLRI_D LSX128:$vj, uimm6:$imm)>; ++ ++// VBITSET_{B/H/W/D} ++def : Pat<(or v16i8:$vj, (shl vsplat_imm_eq_1, v16i8:$vk)), ++ (v16i8 (VBITSET_B v16i8:$vj, v16i8:$vk))>; ++def : Pat<(or v8i16:$vj, (shl vsplat_imm_eq_1, v8i16:$vk)), ++ (v8i16 (VBITSET_H v8i16:$vj, v8i16:$vk))>; ++def : Pat<(or v4i32:$vj, (shl vsplat_imm_eq_1, v4i32:$vk)), ++ (v4i32 (VBITSET_W v4i32:$vj, v4i32:$vk))>; ++def : Pat<(or v2i64:$vj, (shl vsplat_imm_eq_1, v2i64:$vk)), ++ (v2i64 (VBITSET_D v2i64:$vj, v2i64:$vk))>; ++def : Pat<(or v16i8:$vj, (shl vsplat_imm_eq_1, (vsplati8imm7 v16i8:$vk))), ++ (v16i8 (VBITSET_B v16i8:$vj, v16i8:$vk))>; ++def : Pat<(or v8i16:$vj, (shl vsplat_imm_eq_1, (vsplati16imm15 v8i16:$vk))), ++ (v8i16 (VBITSET_H v8i16:$vj, v8i16:$vk))>; ++def : Pat<(or v4i32:$vj, (shl 
vsplat_imm_eq_1, (vsplati32imm31 v4i32:$vk))), ++ (v4i32 (VBITSET_W v4i32:$vj, v4i32:$vk))>; ++def : Pat<(or v2i64:$vj, (shl vsplat_imm_eq_1, (vsplati64imm63 v2i64:$vk))), ++ (v2i64 (VBITSET_D v2i64:$vj, v2i64:$vk))>; ++ ++// VBITSETI_{B/H/W/D} ++def : Pat<(or (v16i8 LSX128:$vj), (v16i8 (vsplat_uimm_pow2 uimm3:$imm))), ++ (VBITSETI_B LSX128:$vj, uimm3:$imm)>; ++def : Pat<(or (v8i16 LSX128:$vj), (v8i16 (vsplat_uimm_pow2 uimm4:$imm))), ++ (VBITSETI_H LSX128:$vj, uimm4:$imm)>; ++def : Pat<(or (v4i32 LSX128:$vj), (v4i32 (vsplat_uimm_pow2 uimm5:$imm))), ++ (VBITSETI_W LSX128:$vj, uimm5:$imm)>; ++def : Pat<(or (v2i64 LSX128:$vj), (v2i64 (vsplat_uimm_pow2 uimm6:$imm))), ++ (VBITSETI_D LSX128:$vj, uimm6:$imm)>; ++ ++// VBITREV_{B/H/W/D} ++def : Pat<(xor v16i8:$vj, (shl vsplat_imm_eq_1, v16i8:$vk)), ++ (v16i8 (VBITREV_B v16i8:$vj, v16i8:$vk))>; ++def : Pat<(xor v8i16:$vj, (shl vsplat_imm_eq_1, v8i16:$vk)), ++ (v8i16 (VBITREV_H v8i16:$vj, v8i16:$vk))>; ++def : Pat<(xor v4i32:$vj, (shl vsplat_imm_eq_1, v4i32:$vk)), ++ (v4i32 (VBITREV_W v4i32:$vj, v4i32:$vk))>; ++def : Pat<(xor v2i64:$vj, (shl vsplat_imm_eq_1, v2i64:$vk)), ++ (v2i64 (VBITREV_D v2i64:$vj, v2i64:$vk))>; ++def : Pat<(xor v16i8:$vj, (shl vsplat_imm_eq_1, (vsplati8imm7 v16i8:$vk))), ++ (v16i8 (VBITREV_B v16i8:$vj, v16i8:$vk))>; ++def : Pat<(xor v8i16:$vj, (shl vsplat_imm_eq_1, (vsplati16imm15 v8i16:$vk))), ++ (v8i16 (VBITREV_H v8i16:$vj, v8i16:$vk))>; ++def : Pat<(xor v4i32:$vj, (shl vsplat_imm_eq_1, (vsplati32imm31 v4i32:$vk))), ++ (v4i32 (VBITREV_W v4i32:$vj, v4i32:$vk))>; ++def : Pat<(xor v2i64:$vj, (shl vsplat_imm_eq_1, (vsplati64imm63 v2i64:$vk))), ++ (v2i64 (VBITREV_D v2i64:$vj, v2i64:$vk))>; ++ ++// VBITREVI_{B/H/W/D} ++def : Pat<(xor (v16i8 LSX128:$vj), (v16i8 (vsplat_uimm_pow2 uimm3:$imm))), ++ (VBITREVI_B LSX128:$vj, uimm3:$imm)>; ++def : Pat<(xor (v8i16 LSX128:$vj), (v8i16 (vsplat_uimm_pow2 uimm4:$imm))), ++ (VBITREVI_H LSX128:$vj, uimm4:$imm)>; ++def : Pat<(xor (v4i32 LSX128:$vj), (v4i32 
(vsplat_uimm_pow2 uimm5:$imm))), ++ (VBITREVI_W LSX128:$vj, uimm5:$imm)>; ++def : Pat<(xor (v2i64 LSX128:$vj), (v2i64 (vsplat_uimm_pow2 uimm6:$imm))), ++ (VBITREVI_D LSX128:$vj, uimm6:$imm)>; ++ ++// VFADD_{S/D} ++defm : PatVrVrF; ++ ++// VFSUB_{S/D} ++defm : PatVrVrF; ++ ++// VFMUL_{S/D} ++defm : PatVrVrF; ++ ++// VFDIV_{S/D} ++defm : PatVrVrF; ++ ++// VFMADD_{S/D} ++def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), ++ (VFMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; ++def : Pat<(fma v2f64:$vj, v2f64:$vk, v2f64:$va), ++ (VFMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; ++ ++// VINSGR2VR_{B/H/W/D} ++def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), ++ (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; ++def : Pat<(vector_insert v8i16:$vd, GRLenVT:$rj, uimm3:$imm), ++ (VINSGR2VR_H v8i16:$vd, GRLenVT:$rj, uimm3:$imm)>; ++def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), ++ (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; ++def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), ++ (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; ++ ++// VPICKVE2GR_{B/H/W}[U] ++def : Pat<(loongarch_vpick_sext_elt v16i8:$vd, uimm4:$imm, i8), ++ (VPICKVE2GR_B v16i8:$vd, uimm4:$imm)>; ++def : Pat<(loongarch_vpick_sext_elt v8i16:$vd, uimm3:$imm, i16), ++ (VPICKVE2GR_H v8i16:$vd, uimm3:$imm)>; ++def : Pat<(loongarch_vpick_sext_elt v4i32:$vd, uimm2:$imm, i32), ++ (VPICKVE2GR_W v4i32:$vd, uimm2:$imm)>; ++ ++def : Pat<(loongarch_vpick_zext_elt v16i8:$vd, uimm4:$imm, i8), ++ (VPICKVE2GR_BU v16i8:$vd, uimm4:$imm)>; ++def : Pat<(loongarch_vpick_zext_elt v8i16:$vd, uimm3:$imm, i16), ++ (VPICKVE2GR_HU v8i16:$vd, uimm3:$imm)>; ++def : Pat<(loongarch_vpick_zext_elt v4i32:$vd, uimm2:$imm, i32), ++ (VPICKVE2GR_WU v4i32:$vd, uimm2:$imm)>; ++ ++// VREPLGR2VR_{B/H/W/D} ++def : Pat<(lsxsplati8 GPR:$rj), (VREPLGR2VR_B GPR:$rj)>; ++def : Pat<(lsxsplati16 GPR:$rj), (VREPLGR2VR_H GPR:$rj)>; ++def : Pat<(lsxsplati32 GPR:$rj), (VREPLGR2VR_W GPR:$rj)>; ++def : Pat<(lsxsplati64 GPR:$rj), 
(VREPLGR2VR_D GPR:$rj)>; ++ ++// VREPLVE_{B/H/W/D} ++def : Pat<(loongarch_vreplve v16i8:$vj, GRLenVT:$rk), ++ (VREPLVE_B v16i8:$vj, GRLenVT:$rk)>; ++def : Pat<(loongarch_vreplve v8i16:$vj, GRLenVT:$rk), ++ (VREPLVE_H v8i16:$vj, GRLenVT:$rk)>; ++def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk), ++ (VREPLVE_W v4i32:$vj, GRLenVT:$rk)>; ++def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk), ++ (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>; ++ ++// Loads/Stores ++foreach vt = [v16i8, v8i16, v4i32, v2i64] in { ++ defm : LdPat; ++ def : RegRegLdPat; ++ defm : StPat; ++ def : RegRegStPat; ++} ++ ++} // Predicates = [HasExtLSX] ++ ++/// Intrinsic pattern ++ ++class deriveLSXIntrinsic { ++ Intrinsic ret = !cast(!tolower("int_loongarch_lsx_"#Inst)); ++} ++ ++let Predicates = [HasExtLSX] in { ++ ++// vty: v16i8/v8i16/v4i32/v2i64 ++// Pat<(Intrinsic vty:$vj, vty:$vk), ++// (LAInst vty:$vj, vty:$vk)>; ++foreach Inst = ["VSADD_B", "VSADD_BU", "VSSUB_B", "VSSUB_BU", ++ "VHADDW_H_B", "VHADDW_HU_BU", "VHSUBW_H_B", "VHSUBW_HU_BU", ++ "VADDWEV_H_B", "VADDWOD_H_B", "VSUBWEV_H_B", "VSUBWOD_H_B", ++ "VADDWEV_H_BU", "VADDWOD_H_BU", "VSUBWEV_H_BU", "VSUBWOD_H_BU", ++ "VADDWEV_H_BU_B", "VADDWOD_H_BU_B", ++ "VAVG_B", "VAVG_BU", "VAVGR_B", "VAVGR_BU", ++ "VABSD_B", "VABSD_BU", "VADDA_B", "VMUH_B", "VMUH_BU", ++ "VMULWEV_H_B", "VMULWOD_H_B", "VMULWEV_H_BU", "VMULWOD_H_BU", ++ "VMULWEV_H_BU_B", "VMULWOD_H_BU_B", "VSIGNCOV_B", ++ "VANDN_V", "VORN_V", "VROTR_B", "VSRLR_B", "VSRAR_B", ++ "VSEQ_B", "VSLE_B", "VSLE_BU", "VSLT_B", "VSLT_BU", ++ "VPACKEV_B", "VPACKOD_B", "VPICKEV_B", "VPICKOD_B", ++ "VILVL_B", "VILVH_B"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VSADD_H", "VSADD_HU", "VSSUB_H", "VSSUB_HU", ++ "VHADDW_W_H", "VHADDW_WU_HU", "VHSUBW_W_H", "VHSUBW_WU_HU", ++ "VADDWEV_W_H", "VADDWOD_W_H", "VSUBWEV_W_H", "VSUBWOD_W_H", ++ "VADDWEV_W_HU", "VADDWOD_W_HU", "VSUBWEV_W_HU", "VSUBWOD_W_HU", ++ 
"VADDWEV_W_HU_H", "VADDWOD_W_HU_H", ++ "VAVG_H", "VAVG_HU", "VAVGR_H", "VAVGR_HU", ++ "VABSD_H", "VABSD_HU", "VADDA_H", "VMUH_H", "VMUH_HU", ++ "VMULWEV_W_H", "VMULWOD_W_H", "VMULWEV_W_HU", "VMULWOD_W_HU", ++ "VMULWEV_W_HU_H", "VMULWOD_W_HU_H", "VSIGNCOV_H", "VROTR_H", ++ "VSRLR_H", "VSRAR_H", "VSRLN_B_H", "VSRAN_B_H", "VSRLRN_B_H", ++ "VSRARN_B_H", "VSSRLN_B_H", "VSSRAN_B_H", "VSSRLN_BU_H", ++ "VSSRAN_BU_H", "VSSRLRN_B_H", "VSSRARN_B_H", "VSSRLRN_BU_H", ++ "VSSRARN_BU_H", ++ "VSEQ_H", "VSLE_H", "VSLE_HU", "VSLT_H", "VSLT_HU", ++ "VPACKEV_H", "VPACKOD_H", "VPICKEV_H", "VPICKOD_H", ++ "VILVL_H", "VILVH_H"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VSADD_W", "VSADD_WU", "VSSUB_W", "VSSUB_WU", ++ "VHADDW_D_W", "VHADDW_DU_WU", "VHSUBW_D_W", "VHSUBW_DU_WU", ++ "VADDWEV_D_W", "VADDWOD_D_W", "VSUBWEV_D_W", "VSUBWOD_D_W", ++ "VADDWEV_D_WU", "VADDWOD_D_WU", "VSUBWEV_D_WU", "VSUBWOD_D_WU", ++ "VADDWEV_D_WU_W", "VADDWOD_D_WU_W", ++ "VAVG_W", "VAVG_WU", "VAVGR_W", "VAVGR_WU", ++ "VABSD_W", "VABSD_WU", "VADDA_W", "VMUH_W", "VMUH_WU", ++ "VMULWEV_D_W", "VMULWOD_D_W", "VMULWEV_D_WU", "VMULWOD_D_WU", ++ "VMULWEV_D_WU_W", "VMULWOD_D_WU_W", "VSIGNCOV_W", "VROTR_W", ++ "VSRLR_W", "VSRAR_W", "VSRLN_H_W", "VSRAN_H_W", "VSRLRN_H_W", ++ "VSRARN_H_W", "VSSRLN_H_W", "VSSRAN_H_W", "VSSRLN_HU_W", ++ "VSSRAN_HU_W", "VSSRLRN_H_W", "VSSRARN_H_W", "VSSRLRN_HU_W", ++ "VSSRARN_HU_W", ++ "VSEQ_W", "VSLE_W", "VSLE_WU", "VSLT_W", "VSLT_WU", ++ "VPACKEV_W", "VPACKOD_W", "VPICKEV_W", "VPICKOD_W", ++ "VILVL_W", "VILVH_W"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VADD_Q", "VSUB_Q", ++ "VSADD_D", "VSADD_DU", "VSSUB_D", "VSSUB_DU", ++ "VHADDW_Q_D", "VHADDW_QU_DU", "VHSUBW_Q_D", "VHSUBW_QU_DU", ++ "VADDWEV_Q_D", "VADDWOD_Q_D", "VSUBWEV_Q_D", "VSUBWOD_Q_D", ++ "VADDWEV_Q_DU", "VADDWOD_Q_DU", 
"VSUBWEV_Q_DU", "VSUBWOD_Q_DU", ++ "VADDWEV_Q_DU_D", "VADDWOD_Q_DU_D", ++ "VAVG_D", "VAVG_DU", "VAVGR_D", "VAVGR_DU", ++ "VABSD_D", "VABSD_DU", "VADDA_D", "VMUH_D", "VMUH_DU", ++ "VMULWEV_Q_D", "VMULWOD_Q_D", "VMULWEV_Q_DU", "VMULWOD_Q_DU", ++ "VMULWEV_Q_DU_D", "VMULWOD_Q_DU_D", "VSIGNCOV_D", "VROTR_D", ++ "VSRLR_D", "VSRAR_D", "VSRLN_W_D", "VSRAN_W_D", "VSRLRN_W_D", ++ "VSRARN_W_D", "VSSRLN_W_D", "VSSRAN_W_D", "VSSRLN_WU_D", ++ "VSSRAN_WU_D", "VSSRLRN_W_D", "VSSRARN_W_D", "VSSRLRN_WU_D", ++ "VSSRARN_WU_D", "VFFINT_S_L", ++ "VSEQ_D", "VSLE_D", "VSLE_DU", "VSLT_D", "VSLT_DU", ++ "VPACKEV_D", "VPACKOD_D", "VPICKEV_D", "VPICKOD_D", ++ "VILVL_D", "VILVH_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk)>; ++ ++// vty: v16i8/v8i16/v4i32/v2i64 ++// Pat<(Intrinsic vty:$vd, vty:$vj, vty:$vk), ++// (LAInst vty:$vd, vty:$vj, vty:$vk)>; ++foreach Inst = ["VMADDWEV_H_B", "VMADDWOD_H_B", "VMADDWEV_H_BU", ++ "VMADDWOD_H_BU", "VMADDWEV_H_BU_B", "VMADDWOD_H_BU_B"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v8i16 LSX128:$vd), (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VMADDWEV_W_H", "VMADDWOD_W_H", "VMADDWEV_W_HU", ++ "VMADDWOD_W_HU", "VMADDWEV_W_HU_H", "VMADDWOD_W_HU_H"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v4i32 LSX128:$vd), (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VMADDWEV_D_W", "VMADDWOD_D_W", "VMADDWEV_D_WU", ++ "VMADDWOD_D_WU", "VMADDWEV_D_WU_W", "VMADDWOD_D_WU_W"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v2i64 LSX128:$vd), (v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VMADDWEV_Q_D", "VMADDWOD_Q_D", "VMADDWEV_Q_DU", ++ "VMADDWOD_Q_DU", "VMADDWEV_Q_DU_D", "VMADDWOD_Q_DU_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v2i64 LSX128:$vd), (v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), ++ 
(!cast(Inst) LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++ ++// vty: v16i8/v8i16/v4i32/v2i64 ++// Pat<(Intrinsic vty:$vj), ++// (LAInst vty:$vj)>; ++foreach Inst = ["VEXTH_H_B", "VEXTH_HU_BU", ++ "VMSKLTZ_B", "VMSKGEZ_B", "VMSKNZ_B", ++ "VCLO_B", "VCLZ_B"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v16i8 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++foreach Inst = ["VEXTH_W_H", "VEXTH_WU_HU", "VMSKLTZ_H", ++ "VCLO_H", "VCLZ_H", "VFCVTL_S_H", "VFCVTH_S_H"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v8i16 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++foreach Inst = ["VEXTH_D_W", "VEXTH_DU_WU", "VMSKLTZ_W", ++ "VCLO_W", "VCLZ_W", "VFFINT_S_W", "VFFINT_S_WU", ++ "VFFINTL_D_W", "VFFINTH_D_W"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v4i32 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++foreach Inst = ["VEXTH_Q_D", "VEXTH_QU_DU", "VMSKLTZ_D", ++ "VEXTL_Q_D", "VEXTL_QU_DU", ++ "VCLO_D", "VCLZ_D", "VFFINT_D_L", "VFFINT_D_LU"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v2i64 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++ ++// Pat<(Intrinsic timm:$imm) ++// (LAInst timm:$imm)>; ++def : Pat<(int_loongarch_lsx_vldi timm:$imm), ++ (VLDI (to_valide_timm timm:$imm))>; ++foreach Inst = ["VREPLI_B", "VREPLI_H", "VREPLI_W", "VREPLI_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret timm:$imm), ++ (!cast("Pseudo"#Inst) (to_valide_timm timm:$imm))>; ++ ++// vty: v16i8/v8i16/v4i32/v2i64 ++// Pat<(Intrinsic vty:$vj, timm:$imm) ++// (LAInst vty:$vj, timm:$imm)>; ++foreach Inst = ["VSAT_B", "VSAT_BU", "VNORI_B", "VROTRI_B", "VSLLWIL_H_B", ++ "VSLLWIL_HU_BU", "VSRLRI_B", "VSRARI_B", ++ "VSEQI_B", "VSLEI_B", "VSLEI_BU", "VSLTI_B", "VSLTI_BU", ++ "VREPLVEI_B", "VBSLL_V", "VBSRL_V", "VSHUF4I_B"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v16i8 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++foreach Inst = ["VSAT_H", "VSAT_HU", "VROTRI_H", "VSLLWIL_W_H", ++ "VSLLWIL_WU_HU", "VSRLRI_H", "VSRARI_H", ++ "VSEQI_H", "VSLEI_H", "VSLEI_HU", "VSLTI_H", "VSLTI_HU", ++ "VREPLVEI_H", 
"VSHUF4I_H"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v8i16 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++foreach Inst = ["VSAT_W", "VSAT_WU", "VROTRI_W", "VSLLWIL_D_W", ++ "VSLLWIL_DU_WU", "VSRLRI_W", "VSRARI_W", ++ "VSEQI_W", "VSLEI_W", "VSLEI_WU", "VSLTI_W", "VSLTI_WU", ++ "VREPLVEI_W", "VSHUF4I_W"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v4i32 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++foreach Inst = ["VSAT_D", "VSAT_DU", "VROTRI_D", "VSRLRI_D", "VSRARI_D", ++ "VSEQI_D", "VSLEI_D", "VSLEI_DU", "VSLTI_D", "VSLTI_DU", ++ "VPICKVE2GR_D", "VPICKVE2GR_DU", ++ "VREPLVEI_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v2i64 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++ ++// vty: v16i8/v8i16/v4i32/v2i64 ++// Pat<(Intrinsic vty:$vd, vty:$vj, timm:$imm) ++// (LAInst vty:$vd, vty:$vj, timm:$imm)>; ++foreach Inst = ["VSRLNI_B_H", "VSRANI_B_H", "VSRLRNI_B_H", "VSRARNI_B_H", ++ "VSSRLNI_B_H", "VSSRANI_B_H", "VSSRLNI_BU_H", "VSSRANI_BU_H", ++ "VSSRLRNI_B_H", "VSSRARNI_B_H", "VSSRLRNI_BU_H", "VSSRARNI_BU_H", ++ "VFRSTPI_B", "VBITSELI_B", "VEXTRINS_B"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v16i8 LSX128:$vd), (v16i8 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, ++ (to_valide_timm timm:$imm))>; ++foreach Inst = ["VSRLNI_H_W", "VSRANI_H_W", "VSRLRNI_H_W", "VSRARNI_H_W", ++ "VSSRLNI_H_W", "VSSRANI_H_W", "VSSRLNI_HU_W", "VSSRANI_HU_W", ++ "VSSRLRNI_H_W", "VSSRARNI_H_W", "VSSRLRNI_HU_W", "VSSRARNI_HU_W", ++ "VFRSTPI_H", "VEXTRINS_H"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v8i16 LSX128:$vd), (v8i16 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, ++ (to_valide_timm timm:$imm))>; ++foreach Inst = ["VSRLNI_W_D", "VSRANI_W_D", "VSRLRNI_W_D", "VSRARNI_W_D", ++ "VSSRLNI_W_D", "VSSRANI_W_D", "VSSRLNI_WU_D", "VSSRANI_WU_D", ++ "VSSRLRNI_W_D", "VSSRARNI_W_D", "VSSRLRNI_WU_D", "VSSRARNI_WU_D", ++ "VPERMI_W", "VEXTRINS_W"] in ++ def 
: Pat<(deriveLSXIntrinsic.ret ++ (v4i32 LSX128:$vd), (v4i32 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, ++ (to_valide_timm timm:$imm))>; ++foreach Inst = ["VSRLNI_D_Q", "VSRANI_D_Q", "VSRLRNI_D_Q", "VSRARNI_D_Q", ++ "VSSRLNI_D_Q", "VSSRANI_D_Q", "VSSRLNI_DU_Q", "VSSRANI_DU_Q", ++ "VSSRLRNI_D_Q", "VSSRARNI_D_Q", "VSSRLRNI_DU_Q", "VSSRARNI_DU_Q", ++ "VSHUF4I_D", "VEXTRINS_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v2i64 LSX128:$vd), (v2i64 LSX128:$vj), timm:$imm), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, ++ (to_valide_timm timm:$imm))>; ++ ++// vty: v16i8/v8i16/v4i32/v2i64 ++// Pat<(Intrinsic vty:$vd, vty:$vj, vty:$vk), ++// (LAInst vty:$vd, vty:$vj, vty:$vk)>; ++foreach Inst = ["VFRSTP_B", "VBITSEL_V", "VSHUF_B"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v16i8 LSX128:$vd), (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VFRSTP_H", "VSHUF_H"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v8i16 LSX128:$vd), (v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++def : Pat<(int_loongarch_lsx_vshuf_w (v4i32 LSX128:$vd), (v4i32 LSX128:$vj), ++ (v4i32 LSX128:$vk)), ++ (VSHUF_W LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++def : Pat<(int_loongarch_lsx_vshuf_d (v2i64 LSX128:$vd), (v2i64 LSX128:$vj), ++ (v2i64 LSX128:$vk)), ++ (VSHUF_D LSX128:$vd, LSX128:$vj, LSX128:$vk)>; ++ ++// vty: v4f32/v2f64 ++// Pat<(Intrinsic vty:$vj, vty:$vk, vty:$va), ++// (LAInst vty:$vj, vty:$vk, vty:$va)>; ++foreach Inst = ["VFMSUB_S", "VFNMADD_S", "VFNMSUB_S"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v4f32 LSX128:$vj), (v4f32 LSX128:$vk), (v4f32 LSX128:$va)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk, LSX128:$va)>; ++foreach Inst = ["VFMSUB_D", "VFNMADD_D", "VFNMSUB_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v2f64 LSX128:$vj), (v2f64 LSX128:$vk), (v2f64 LSX128:$va)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk, LSX128:$va)>; ++ ++// vty: v4f32/v2f64 ++// Pat<(Intrinsic 
vty:$vj, vty:$vk), ++// (LAInst vty:$vj, vty:$vk)>; ++foreach Inst = ["VFMAX_S", "VFMIN_S", "VFMAXA_S", "VFMINA_S", "VFCVT_H_S", ++ "VFCMP_CAF_S", "VFCMP_CUN_S", "VFCMP_CEQ_S", "VFCMP_CUEQ_S", ++ "VFCMP_CLT_S", "VFCMP_CULT_S", "VFCMP_CLE_S", "VFCMP_CULE_S", ++ "VFCMP_CNE_S", "VFCMP_COR_S", "VFCMP_CUNE_S", ++ "VFCMP_SAF_S", "VFCMP_SUN_S", "VFCMP_SEQ_S", "VFCMP_SUEQ_S", ++ "VFCMP_SLT_S", "VFCMP_SULT_S", "VFCMP_SLE_S", "VFCMP_SULE_S", ++ "VFCMP_SNE_S", "VFCMP_SOR_S", "VFCMP_SUNE_S"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v4f32 LSX128:$vj), (v4f32 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk)>; ++foreach Inst = ["VFMAX_D", "VFMIN_D", "VFMAXA_D", "VFMINA_D", "VFCVT_S_D", ++ "VFTINTRNE_W_D", "VFTINTRZ_W_D", "VFTINTRP_W_D", "VFTINTRM_W_D", ++ "VFTINT_W_D", ++ "VFCMP_CAF_D", "VFCMP_CUN_D", "VFCMP_CEQ_D", "VFCMP_CUEQ_D", ++ "VFCMP_CLT_D", "VFCMP_CULT_D", "VFCMP_CLE_D", "VFCMP_CULE_D", ++ "VFCMP_CNE_D", "VFCMP_COR_D", "VFCMP_CUNE_D", ++ "VFCMP_SAF_D", "VFCMP_SUN_D", "VFCMP_SEQ_D", "VFCMP_SUEQ_D", ++ "VFCMP_SLT_D", "VFCMP_SULT_D", "VFCMP_SLE_D", "VFCMP_SULE_D", ++ "VFCMP_SNE_D", "VFCMP_SOR_D", "VFCMP_SUNE_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret ++ (v2f64 LSX128:$vj), (v2f64 LSX128:$vk)), ++ (!cast(Inst) LSX128:$vj, LSX128:$vk)>; ++ ++// vty: v4f32/v2f64 ++// Pat<(Intrinsic vty:$vj), ++// (LAInst vty:$vj)>; ++foreach Inst = ["VFLOGB_S", "VFCLASS_S", "VFSQRT_S", "VFRECIP_S", "VFRSQRT_S", ++ "VFRINT_S", "VFCVTL_D_S", "VFCVTH_D_S", ++ "VFRINTRNE_S", "VFRINTRZ_S", "VFRINTRP_S", "VFRINTRM_S", ++ "VFTINTRNE_W_S", "VFTINTRZ_W_S", "VFTINTRP_W_S", "VFTINTRM_W_S", ++ "VFTINT_W_S", "VFTINTRZ_WU_S", "VFTINT_WU_S", ++ "VFTINTRNEL_L_S", "VFTINTRNEH_L_S", "VFTINTRZL_L_S", ++ "VFTINTRZH_L_S", "VFTINTRPL_L_S", "VFTINTRPH_L_S", ++ "VFTINTRML_L_S", "VFTINTRMH_L_S", "VFTINTL_L_S", ++ "VFTINTH_L_S"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v4f32 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D", ++ 
"VFRINT_D", ++ "VFRINTRNE_D", "VFRINTRZ_D", "VFRINTRP_D", "VFRINTRM_D", ++ "VFTINTRNE_L_D", "VFTINTRZ_L_D", "VFTINTRP_L_D", "VFTINTRM_L_D", ++ "VFTINT_L_D", "VFTINTRZ_LU_D", "VFTINT_LU_D"] in ++ def : Pat<(deriveLSXIntrinsic.ret (v2f64 LSX128:$vj)), ++ (!cast(Inst) LSX128:$vj)>; ++ ++// load ++def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), ++ (VLD GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lsx_vldx GPR:$rj, GPR:$rk), ++ (VLDX GPR:$rj, GPR:$rk)>; ++ ++def : Pat<(int_loongarch_lsx_vldrepl_b GPR:$rj, timm:$imm), ++ (VLDREPL_B GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lsx_vldrepl_h GPR:$rj, timm:$imm), ++ (VLDREPL_H GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lsx_vldrepl_w GPR:$rj, timm:$imm), ++ (VLDREPL_W GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lsx_vldrepl_d GPR:$rj, timm:$imm), ++ (VLDREPL_D GPR:$rj, (to_valide_timm timm:$imm))>; ++ ++// store ++def : Pat<(int_loongarch_lsx_vst LSX128:$vd, GPR:$rj, timm:$imm), ++ (VST LSX128:$vd, GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lsx_vstx LSX128:$vd, GPR:$rj, GPR:$rk), ++ (VSTX LSX128:$vd, GPR:$rj, GPR:$rk)>; ++ ++def : Pat<(int_loongarch_lsx_vstelm_b v16i8:$vd, GPR:$rj, timm:$imm, timm:$idx), ++ (VSTELM_B v16i8:$vd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++def : Pat<(int_loongarch_lsx_vstelm_h v8i16:$vd, GPR:$rj, timm:$imm, timm:$idx), ++ (VSTELM_H v8i16:$vd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++def : Pat<(int_loongarch_lsx_vstelm_w v4i32:$vd, GPR:$rj, timm:$imm, timm:$idx), ++ (VSTELM_W v4i32:$vd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++def : Pat<(int_loongarch_lsx_vstelm_d v2i64:$vd, GPR:$rj, timm:$imm, timm:$idx), ++ (VSTELM_D v2i64:$vd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++ + } // Predicates = [HasExtLSX] +-- +2.20.1 + + +From 
6f813b014a5df84162cc182994d597674d433a9a Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Sat, 19 Aug 2023 16:53:50 +0800 +Subject: [PATCH 03/35] [LoongArch] Add LASX intrinsic support + +This patch is similar to D155829. + +Depends on D155829 + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D155830 + +(cherry picked from commit 691f0d00b84f6ecaf8e341ef38256e939cca6b1e) +--- + llvm/include/llvm/IR/IntrinsicsLoongArch.td | 523 +++++++++++++ + .../LoongArch/LoongArchISelLowering.cpp | 402 +++++++++- + .../Target/LoongArch/LoongArchInstrInfo.cpp | 12 + + .../LoongArch/LoongArchLASXInstrInfo.td | 702 ++++++++++++++++++ + 4 files changed, 1633 insertions(+), 6 deletions(-) + +diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +index d39d8261ebe3..685deaec7709 100644 +--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td ++++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td +@@ -647,3 +647,526 @@ def int_loongarch_lsx_vstelm_d + [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; + + } // TargetPrefix = "loongarch" ++ ++//===----------------------------------------------------------------------===// ++// LASX ++ ++let TargetPrefix = "loongarch" in { ++foreach inst = ["xvadd_b", "xvsub_b", ++ "xvsadd_b", "xvsadd_bu", "xvssub_b", "xvssub_bu", ++ "xvavg_b", "xvavg_bu", "xvavgr_b", "xvavgr_bu", ++ "xvabsd_b", "xvabsd_bu", "xvadda_b", ++ "xvmax_b", "xvmax_bu", "xvmin_b", "xvmin_bu", ++ "xvmul_b", "xvmuh_b", "xvmuh_bu", ++ "xvdiv_b", "xvdiv_bu", "xvmod_b", "xvmod_bu", "xvsigncov_b", ++ "xvand_v", "xvor_v", "xvxor_v", "xvnor_v", "xvandn_v", "xvorn_v", ++ "xvsll_b", "xvsrl_b", "xvsra_b", "xvrotr_b", "xvsrlr_b", "xvsrar_b", ++ "xvbitclr_b", "xvbitset_b", "xvbitrev_b", ++ "xvseq_b", "xvsle_b", "xvsle_bu", "xvslt_b", "xvslt_bu", ++ "xvpackev_b", "xvpackod_b", "xvpickev_b", "xvpickod_b", ++ "xvilvl_b", "xvilvh_b"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v32i8_ty], ++ [llvm_v32i8_ty, llvm_v32i8_ty], ++ 
[IntrNoMem]>; ++ ++foreach inst = ["xvadd_h", "xvsub_h", ++ "xvsadd_h", "xvsadd_hu", "xvssub_h", "xvssub_hu", ++ "xvavg_h", "xvavg_hu", "xvavgr_h", "xvavgr_hu", ++ "xvabsd_h", "xvabsd_hu", "xvadda_h", ++ "xvmax_h", "xvmax_hu", "xvmin_h", "xvmin_hu", ++ "xvmul_h", "xvmuh_h", "xvmuh_hu", ++ "xvdiv_h", "xvdiv_hu", "xvmod_h", "xvmod_hu", "xvsigncov_h", ++ "xvsll_h", "xvsrl_h", "xvsra_h", "xvrotr_h", "xvsrlr_h", "xvsrar_h", ++ "xvbitclr_h", "xvbitset_h", "xvbitrev_h", ++ "xvseq_h", "xvsle_h", "xvsle_hu", "xvslt_h", "xvslt_hu", ++ "xvpackev_h", "xvpackod_h", "xvpickev_h", "xvpickod_h", ++ "xvilvl_h", "xvilvh_h"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], ++ [llvm_v16i16_ty, llvm_v16i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvadd_w", "xvsub_w", ++ "xvsadd_w", "xvsadd_wu", "xvssub_w", "xvssub_wu", ++ "xvavg_w", "xvavg_wu", "xvavgr_w", "xvavgr_wu", ++ "xvabsd_w", "xvabsd_wu", "xvadda_w", ++ "xvmax_w", "xvmax_wu", "xvmin_w", "xvmin_wu", ++ "xvmul_w", "xvmuh_w", "xvmuh_wu", ++ "xvdiv_w", "xvdiv_wu", "xvmod_w", "xvmod_wu", "xvsigncov_w", ++ "xvsll_w", "xvsrl_w", "xvsra_w", "xvrotr_w", "xvsrlr_w", "xvsrar_w", ++ "xvbitclr_w", "xvbitset_w", "xvbitrev_w", ++ "xvseq_w", "xvsle_w", "xvsle_wu", "xvslt_w", "xvslt_wu", ++ "xvpackev_w", "xvpackod_w", "xvpickev_w", "xvpickod_w", ++ "xvilvl_w", "xvilvh_w", "xvperm_w"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ [llvm_v8i32_ty, llvm_v8i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvadd_d", "xvadd_q", "xvsub_d", "xvsub_q", ++ "xvsadd_d", "xvsadd_du", "xvssub_d", "xvssub_du", ++ "xvhaddw_q_d", "xvhaddw_qu_du", "xvhsubw_q_d", "xvhsubw_qu_du", ++ "xvaddwev_q_d", "xvaddwod_q_d", "xvsubwev_q_d", "xvsubwod_q_d", ++ "xvaddwev_q_du", "xvaddwod_q_du", "xvsubwev_q_du", "xvsubwod_q_du", ++ "xvaddwev_q_du_d", "xvaddwod_q_du_d", ++ "xvavg_d", "xvavg_du", "xvavgr_d", "xvavgr_du", ++ "xvabsd_d", "xvabsd_du", "xvadda_d", ++ "xvmax_d", "xvmax_du", "xvmin_d", "xvmin_du", ++ "xvmul_d", "xvmuh_d", "xvmuh_du", ++ 
"xvmulwev_q_d", "xvmulwod_q_d", "xvmulwev_q_du", "xvmulwod_q_du", ++ "xvmulwev_q_du_d", "xvmulwod_q_du_d", ++ "xvdiv_d", "xvdiv_du", "xvmod_d", "xvmod_du", "xvsigncov_d", ++ "xvsll_d", "xvsrl_d", "xvsra_d", "xvrotr_d", "xvsrlr_d", "xvsrar_d", ++ "xvbitclr_d", "xvbitset_d", "xvbitrev_d", ++ "xvseq_d", "xvsle_d", "xvsle_du", "xvslt_d", "xvslt_du", ++ "xvpackev_d", "xvpackod_d", "xvpickev_d", "xvpickod_d", ++ "xvilvl_d", "xvilvh_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], ++ [llvm_v4i64_ty, llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvaddi_bu", "xvsubi_bu", ++ "xvmaxi_b", "xvmaxi_bu", "xvmini_b", "xvmini_bu", ++ "xvsat_b", "xvsat_bu", ++ "xvandi_b", "xvori_b", "xvxori_b", "xvnori_b", ++ "xvslli_b", "xvsrli_b", "xvsrai_b", "xvrotri_b", ++ "xvsrlri_b", "xvsrari_b", ++ "xvbitclri_b", "xvbitseti_b", "xvbitrevi_b", ++ "xvseqi_b", "xvslei_b", "xvslei_bu", "xvslti_b", "xvslti_bu", ++ "xvrepl128vei_b", "xvbsll_v", "xvbsrl_v", "xvshuf4i_b"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v32i8_ty], ++ [llvm_v32i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvaddi_hu", "xvsubi_hu", ++ "xvmaxi_h", "xvmaxi_hu", "xvmini_h", "xvmini_hu", ++ "xvsat_h", "xvsat_hu", ++ "xvslli_h", "xvsrli_h", "xvsrai_h", "xvrotri_h", ++ "xvsrlri_h", "xvsrari_h", ++ "xvbitclri_h", "xvbitseti_h", "xvbitrevi_h", ++ "xvseqi_h", "xvslei_h", "xvslei_hu", "xvslti_h", "xvslti_hu", ++ "xvrepl128vei_h", "xvshuf4i_h"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], ++ [llvm_v16i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvaddi_wu", "xvsubi_wu", ++ "xvmaxi_w", "xvmaxi_wu", "xvmini_w", "xvmini_wu", ++ "xvsat_w", "xvsat_wu", ++ "xvslli_w", "xvsrli_w", "xvsrai_w", "xvrotri_w", ++ "xvsrlri_w", "xvsrari_w", ++ "xvbitclri_w", "xvbitseti_w", "xvbitrevi_w", ++ "xvseqi_w", "xvslei_w", "xvslei_wu", "xvslti_w", "xvslti_wu", ++ "xvrepl128vei_w", "xvshuf4i_w", "xvpickve_w"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ 
[llvm_v8i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvaddi_du", "xvsubi_du", ++ "xvmaxi_d", "xvmaxi_du", "xvmini_d", "xvmini_du", ++ "xvsat_d", "xvsat_du", ++ "xvslli_d", "xvsrli_d", "xvsrai_d", "xvrotri_d", ++ "xvsrlri_d", "xvsrari_d", ++ "xvbitclri_d", "xvbitseti_d", "xvbitrevi_d", ++ "xvseqi_d", "xvslei_d", "xvslei_du", "xvslti_d", "xvslti_du", ++ "xvrepl128vei_d", "xvpermi_d", "xvpickve_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], ++ [llvm_v4i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++foreach inst = ["xvhaddw_h_b", "xvhaddw_hu_bu", "xvhsubw_h_b", "xvhsubw_hu_bu", ++ "xvaddwev_h_b", "xvaddwod_h_b", "xvsubwev_h_b", "xvsubwod_h_b", ++ "xvaddwev_h_bu", "xvaddwod_h_bu", "xvsubwev_h_bu", "xvsubwod_h_bu", ++ "xvaddwev_h_bu_b", "xvaddwod_h_bu_b", ++ "xvmulwev_h_b", "xvmulwod_h_b", "xvmulwev_h_bu", "xvmulwod_h_bu", ++ "xvmulwev_h_bu_b", "xvmulwod_h_bu_b"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], ++ [llvm_v32i8_ty, llvm_v32i8_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvhaddw_w_h", "xvhaddw_wu_hu", "xvhsubw_w_h", "xvhsubw_wu_hu", ++ "xvaddwev_w_h", "xvaddwod_w_h", "xvsubwev_w_h", "xvsubwod_w_h", ++ "xvaddwev_w_hu", "xvaddwod_w_hu", "xvsubwev_w_hu", "xvsubwod_w_hu", ++ "xvaddwev_w_hu_h", "xvaddwod_w_hu_h", ++ "xvmulwev_w_h", "xvmulwod_w_h", "xvmulwev_w_hu", "xvmulwod_w_hu", ++ "xvmulwev_w_hu_h", "xvmulwod_w_hu_h"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ [llvm_v16i16_ty, llvm_v16i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvhaddw_d_w", "xvhaddw_du_wu", "xvhsubw_d_w", "xvhsubw_du_wu", ++ "xvaddwev_d_w", "xvaddwod_d_w", "xvsubwev_d_w", "xvsubwod_d_w", ++ "xvaddwev_d_wu", "xvaddwod_d_wu", "xvsubwev_d_wu", "xvsubwod_d_wu", ++ "xvaddwev_d_wu_w", "xvaddwod_d_wu_w", ++ "xvmulwev_d_w", "xvmulwod_d_w", "xvmulwev_d_wu", "xvmulwod_d_wu", ++ "xvmulwev_d_wu_w", "xvmulwod_d_wu_w"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], ++ [llvm_v8i32_ty, llvm_v8i32_ty], ++ 
[IntrNoMem]>; ++ ++foreach inst = ["xvsrln_b_h", "xvsran_b_h", "xvsrlrn_b_h", "xvsrarn_b_h", ++ "xvssrln_b_h", "xvssran_b_h", "xvssrln_bu_h", "xvssran_bu_h", ++ "xvssrlrn_b_h", "xvssrarn_b_h", "xvssrlrn_bu_h", "xvssrarn_bu_h"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v32i8_ty], ++ [llvm_v16i16_ty, llvm_v16i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvsrln_h_w", "xvsran_h_w", "xvsrlrn_h_w", "xvsrarn_h_w", ++ "xvssrln_h_w", "xvssran_h_w", "xvssrln_hu_w", "xvssran_hu_w", ++ "xvssrlrn_h_w", "xvssrarn_h_w", "xvssrlrn_hu_w", "xvssrarn_hu_w"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], ++ [llvm_v8i32_ty, llvm_v8i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvsrln_w_d", "xvsran_w_d", "xvsrlrn_w_d", "xvsrarn_w_d", ++ "xvssrln_w_d", "xvssran_w_d", "xvssrln_wu_d", "xvssran_wu_d", ++ "xvssrlrn_w_d", "xvssrarn_w_d", "xvssrlrn_wu_d", "xvssrarn_wu_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ [llvm_v4i64_ty, llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvmadd_b", "xvmsub_b", "xvfrstp_b", "xvbitsel_v", "xvshuf_b"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v32i8_ty], ++ [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvmadd_h", "xvmsub_h", "xvfrstp_h", "xvshuf_h"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v16i16_ty], ++ [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvmadd_w", "xvmsub_w", "xvshuf_w"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v8i32_ty], ++ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvmadd_d", "xvmsub_d", "xvshuf_d"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v4i64_ty], ++ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvsrlni_b_h", "xvsrani_b_h", "xvsrlrni_b_h", "xvsrarni_b_h", ++ "xvssrlni_b_h", "xvssrani_b_h", "xvssrlni_bu_h", "xvssrani_bu_h", ++ "xvssrlrni_b_h", "xvssrarni_b_h", "xvssrlrni_bu_h", 
"xvssrarni_bu_h", ++ "xvfrstpi_b", "xvbitseli_b", "xvextrins_b", "xvpermi_q"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v32i8_ty], ++ [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvsrlni_h_w", "xvsrani_h_w", "xvsrlrni_h_w", "xvsrarni_h_w", ++ "xvssrlni_h_w", "xvssrani_h_w", "xvssrlni_hu_w", "xvssrani_hu_w", ++ "xvssrlrni_h_w", "xvssrarni_h_w", "xvssrlrni_hu_w", "xvssrarni_hu_w", ++ "xvfrstpi_h", "xvextrins_h"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v16i16_ty], ++ [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvsrlni_w_d", "xvsrani_w_d", "xvsrlrni_w_d", "xvsrarni_w_d", ++ "xvssrlni_w_d", "xvssrani_w_d", "xvssrlni_wu_d", "xvssrani_wu_d", ++ "xvssrlrni_w_d", "xvssrarni_w_d", "xvssrlrni_wu_d", "xvssrarni_wu_d", ++ "xvpermi_w", "xvextrins_w", "xvinsve0_w"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v8i32_ty], ++ [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvsrlni_d_q", "xvsrani_d_q", "xvsrlrni_d_q", "xvsrarni_d_q", ++ "xvssrlni_d_q", "xvssrani_d_q", "xvssrlni_du_q", "xvssrani_du_q", ++ "xvssrlrni_d_q", "xvssrarni_d_q", "xvssrlrni_du_q", "xvssrarni_du_q", ++ "xvshuf4i_d", "xvextrins_d", "xvinsve0_d"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v4i64_ty], ++ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++foreach inst = ["xvmaddwev_h_b", "xvmaddwod_h_b", "xvmaddwev_h_bu", ++ "xvmaddwod_h_bu", "xvmaddwev_h_bu_b", "xvmaddwod_h_bu_b"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v16i16_ty], ++ [llvm_v16i16_ty, llvm_v32i8_ty, llvm_v32i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvmaddwev_w_h", "xvmaddwod_w_h", "xvmaddwev_w_hu", ++ "xvmaddwod_w_hu", "xvmaddwev_w_hu_h", "xvmaddwod_w_hu_h"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v8i32_ty], ++ [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvmaddwev_d_w", 
"xvmaddwod_d_w", "xvmaddwev_d_wu", ++ "xvmaddwod_d_wu", "xvmaddwev_d_wu_w", "xvmaddwod_d_wu_w"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v4i64_ty], ++ [llvm_v4i64_ty, llvm_v8i32_ty, llvm_v8i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvmaddwev_q_d", "xvmaddwod_q_d", "xvmaddwev_q_du", ++ "xvmaddwod_q_du", "xvmaddwev_q_du_d", "xvmaddwod_q_du_d"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v4i64_ty], ++ [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvsllwil_h_b", "xvsllwil_hu_bu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], ++ [llvm_v32i8_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvsllwil_w_h", "xvsllwil_wu_hu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ [llvm_v16i16_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvsllwil_d_w", "xvsllwil_du_wu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], ++ [llvm_v8i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++foreach inst = ["xvneg_b", "xvmskltz_b", "xvmskgez_b", "xvmsknz_b", ++ "xvclo_b", "xvclz_b", "xvpcnt_b", ++ "xvreplve0_b", "xvreplve0_q"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v32i8_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvneg_h", "xvmskltz_h", "xvclo_h", "xvclz_h", "xvpcnt_h", ++ "xvreplve0_h"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], [llvm_v16i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvneg_w", "xvmskltz_w", "xvclo_w", "xvclz_w", "xvpcnt_w", ++ "xvreplve0_w"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], [llvm_v8i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvneg_d", "xvexth_q_d", "xvexth_qu_du", "xvmskltz_d", ++ "xvextl_q_d", "xvextl_qu_du", "xvclo_d", "xvclz_d", "xvpcnt_d", ++ "xvreplve0_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvexth_h_b", "xvexth_hu_bu", "vext2xv_h_b", "vext2xv_hu_bu"] in ++ def 
int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvexth_w_h", "xvexth_wu_hu", "vext2xv_w_h", "vext2xv_wu_hu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], [llvm_v16i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvexth_d_w", "xvexth_du_wu", "vext2xv_d_w", "vext2xv_du_wu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], [llvm_v8i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vext2xv_w_b", "vext2xv_wu_bu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++foreach inst = ["vext2xv_d_h", "vext2xv_du_hu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], [llvm_v16i16_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["vext2xv_d_b", "vext2xv_du_bu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lasx_xvldi : VecInt<[llvm_v4i64_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lasx_xvrepli_b : VecInt<[llvm_v32i8_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lasx_xvrepli_h : VecInt<[llvm_v16i16_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lasx_xvrepli_w : VecInt<[llvm_v8i32_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lasx_xvrepli_d : VecInt<[llvm_v4i64_ty], [llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++def int_loongarch_lasx_xvreplgr2vr_b : VecInt<[llvm_v32i8_ty], [llvm_i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xvreplgr2vr_h : VecInt<[llvm_v16i16_ty], [llvm_i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xvreplgr2vr_w : VecInt<[llvm_v8i32_ty], [llvm_i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xvreplgr2vr_d : VecInt<[llvm_v4i64_ty], [llvm_i64_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lasx_xvinsgr2vr_w ++ : VecInt<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lasx_xvinsgr2vr_d ++ : VecInt<[llvm_v4i64_ty], 
[llvm_v4i64_ty, llvm_i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++def int_loongarch_lasx_xvreplve_b ++ : VecInt<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; ++def int_loongarch_lasx_xvreplve_h ++ : VecInt<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; ++def int_loongarch_lasx_xvreplve_w ++ : VecInt<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; ++def int_loongarch_lasx_xvreplve_d ++ : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; ++ ++foreach inst = ["xvpickve2gr_w", "xvpickve2gr_wu" ] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_i32_ty], ++ [llvm_v8i32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++foreach inst = ["xvpickve2gr_d", "xvpickve2gr_du" ] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_i64_ty], ++ [llvm_v4i64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++def int_loongarch_lasx_xbz_b : VecInt<[llvm_i32_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbz_h : VecInt<[llvm_i32_ty], [llvm_v16i16_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbz_w : VecInt<[llvm_i32_ty], [llvm_v8i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbz_d : VecInt<[llvm_i32_ty], [llvm_v4i64_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbz_v : VecInt<[llvm_i32_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lasx_xbnz_v : VecInt<[llvm_i32_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbnz_b : VecInt<[llvm_i32_ty], [llvm_v32i8_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbnz_h : VecInt<[llvm_i32_ty], [llvm_v16i16_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbnz_w : VecInt<[llvm_i32_ty], [llvm_v8i32_ty], ++ [IntrNoMem]>; ++def int_loongarch_lasx_xbnz_d : VecInt<[llvm_i32_ty], [llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++// LASX Float ++ ++foreach inst = ["xvfadd_s", "xvfsub_s", "xvfmul_s", "xvfdiv_s", ++ "xvfmax_s", "xvfmin_s", "xvfmaxa_s", "xvfmina_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], ++ [llvm_v8f32_ty, llvm_v8f32_ty], ++ 
[IntrNoMem]>; ++foreach inst = ["xvfadd_d", "xvfsub_d", "xvfmul_d", "xvfdiv_d", ++ "xvfmax_d", "xvfmin_d", "xvfmaxa_d", "xvfmina_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], ++ [llvm_v4f64_ty, llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvfmadd_s", "xvfmsub_s", "xvfnmadd_s", "xvfnmsub_s"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v8f32_ty], ++ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvfmadd_d", "xvfmsub_d", "xvfnmadd_d", "xvfnmsub_d"] in ++ def int_loongarch_lasx_#inst ++ : VecInt<[llvm_v4f64_ty], ++ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvflogb_s", "xvfsqrt_s", "xvfrecip_s", "xvfrsqrt_s", "xvfrint_s", ++ "xvfrintrne_s", "xvfrintrz_s", "xvfrintrp_s", "xvfrintrm_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvflogb_d", "xvfsqrt_d", "xvfrecip_d", "xvfrsqrt_d", "xvfrint_d", ++ "xvfrintrne_d", "xvfrintrz_d", "xvfrintrp_d", "xvfrintrm_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvfcvtl_s_h", "xvfcvth_s_h"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v16i16_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvfcvtl_d_s", "xvfcvth_d_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v8f32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvftintrne_w_s", "xvftintrz_w_s", "xvftintrp_w_s", "xvftintrm_w_s", ++ "xvftint_w_s", "xvftintrz_wu_s", "xvftint_wu_s", "xvfclass_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], [llvm_v8f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvftintrne_l_d", "xvftintrz_l_d", "xvftintrp_l_d", "xvftintrm_l_d", ++ "xvftint_l_d", "xvftintrz_lu_d", "xvftint_lu_d", "xvfclass_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], [llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvftintrnel_l_s", "xvftintrneh_l_s", 
"xvftintrzl_l_s", ++ "xvftintrzh_l_s", "xvftintrpl_l_s", "xvftintrph_l_s", ++ "xvftintrml_l_s", "xvftintrmh_l_s", "xvftintl_l_s", ++ "xvftinth_l_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], [llvm_v8f32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvffint_s_w", "xvffint_s_wu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8i32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvffint_d_l", "xvffint_d_lu"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4i64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvffintl_d_w", "xvffinth_d_w"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v8i32_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvffint_s_l"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], ++ [llvm_v4i64_ty, llvm_v4i64_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvftintrne_w_d", "xvftintrz_w_d", "xvftintrp_w_d", "xvftintrm_w_d", ++ "xvftint_w_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ [llvm_v4f64_ty, llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvfcvt_h_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v16i16_ty], ++ [llvm_v8f32_ty, llvm_v8f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvfcvt_s_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], ++ [llvm_v4f64_ty, llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++foreach inst = ["xvfcmp_caf_s", "xvfcmp_cun_s", "xvfcmp_ceq_s", "xvfcmp_cueq_s", ++ "xvfcmp_clt_s", "xvfcmp_cult_s", "xvfcmp_cle_s", "xvfcmp_cule_s", ++ "xvfcmp_cne_s", "xvfcmp_cor_s", "xvfcmp_cune_s", ++ "xvfcmp_saf_s", "xvfcmp_sun_s", "xvfcmp_seq_s", "xvfcmp_sueq_s", ++ "xvfcmp_slt_s", "xvfcmp_sult_s", "xvfcmp_sle_s", "xvfcmp_sule_s", ++ "xvfcmp_sne_s", "xvfcmp_sor_s", "xvfcmp_sune_s"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v8i32_ty], ++ [llvm_v8f32_ty, llvm_v8f32_ty], ++ [IntrNoMem]>; ++foreach inst = ["xvfcmp_caf_d", "xvfcmp_cun_d", "xvfcmp_ceq_d", "xvfcmp_cueq_d", ++ "xvfcmp_clt_d", "xvfcmp_cult_d", "xvfcmp_cle_d", 
"xvfcmp_cule_d", ++ "xvfcmp_cne_d", "xvfcmp_cor_d", "xvfcmp_cune_d", ++ "xvfcmp_saf_d", "xvfcmp_sun_d", "xvfcmp_seq_d", "xvfcmp_sueq_d", ++ "xvfcmp_slt_d", "xvfcmp_sult_d", "xvfcmp_sle_d", "xvfcmp_sule_d", ++ "xvfcmp_sne_d", "xvfcmp_sor_d", "xvfcmp_sune_d"] in ++ def int_loongarch_lasx_#inst : VecInt<[llvm_v4i64_ty], ++ [llvm_v4f64_ty, llvm_v4f64_ty], ++ [IntrNoMem]>; ++ ++def int_loongarch_lasx_xvpickve_w_f ++ : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++def int_loongarch_lasx_xvpickve_d_f ++ : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], ++ [IntrNoMem, ImmArg>]>; ++ ++// LASX load/store ++def int_loongarch_lasx_xvld ++ : VecInt<[llvm_v32i8_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lasx_xvldx ++ : VecInt<[llvm_v32i8_ty], [llvm_ptr_ty, llvm_i64_ty], ++ [IntrReadMem, IntrArgMemOnly]>; ++def int_loongarch_lasx_xvldrepl_b ++ : VecInt<[llvm_v32i8_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lasx_xvldrepl_h ++ : VecInt<[llvm_v16i16_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lasx_xvldrepl_w ++ : VecInt<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lasx_xvldrepl_d ++ : VecInt<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_i32_ty], ++ [IntrReadMem, IntrArgMemOnly, ImmArg>]>; ++ ++def int_loongarch_lasx_xvst ++ : VecInt<[], [llvm_v32i8_ty, llvm_ptr_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>]>; ++def int_loongarch_lasx_xvstx ++ : VecInt<[], [llvm_v32i8_ty, llvm_ptr_ty, llvm_i64_ty], ++ [IntrWriteMem, IntrArgMemOnly]>; ++def int_loongarch_lasx_xvstelm_b ++ : VecInt<[], [llvm_v32i8_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++def int_loongarch_lasx_xvstelm_h ++ : VecInt<[], [llvm_v16i16_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, 
IntrArgMemOnly, ImmArg>, ImmArg>]>; ++def int_loongarch_lasx_xvstelm_w ++ : VecInt<[], [llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++def int_loongarch_lasx_xvstelm_d ++ : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], ++ [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; ++} // TargetPrefix = "loongarch" +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index c05133647929..3a40cd06a3eb 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -64,11 +64,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + static const MVT::SimpleValueType LSXVTs[] = { + MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64}; ++ static const MVT::SimpleValueType LASXVTs[] = { ++ MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64}; + + if (Subtarget.hasExtLSX()) + for (MVT VT : LSXVTs) + addRegisterClass(VT, &LoongArch::LSX128RegClass); + ++ if (Subtarget.hasExtLASX()) ++ for (MVT VT : LASXVTs) ++ addRegisterClass(VT, &LoongArch::LASX256RegClass); ++ + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, GRLenVT, + MVT::i1, Promote); + +@@ -207,6 +213,11 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, + {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8}, Legal); + ++ if (Subtarget.hasExtLASX()) ++ setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, ++ {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}, ++ Legal); ++ + // Compute derived properties from the register classes. 
+ computeRegisterProperties(Subtarget.getRegisterInfo()); + +@@ -695,9 +706,17 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vpickve2gr_d: + case Intrinsic::loongarch_lsx_vpickve2gr_du: + case Intrinsic::loongarch_lsx_vreplvei_d: ++ case Intrinsic::loongarch_lasx_xvrepl128vei_d: + return checkIntrinsicImmArg<1>(Op, 2, DAG); + case Intrinsic::loongarch_lsx_vreplvei_w: ++ case Intrinsic::loongarch_lasx_xvrepl128vei_w: ++ case Intrinsic::loongarch_lasx_xvpickve2gr_d: ++ case Intrinsic::loongarch_lasx_xvpickve2gr_du: ++ case Intrinsic::loongarch_lasx_xvpickve_d: ++ case Intrinsic::loongarch_lasx_xvpickve_d_f: + return checkIntrinsicImmArg<2>(Op, 2, DAG); ++ case Intrinsic::loongarch_lasx_xvinsve0_d: ++ return checkIntrinsicImmArg<2>(Op, 3, DAG); + case Intrinsic::loongarch_lsx_vsat_b: + case Intrinsic::loongarch_lsx_vsat_bu: + case Intrinsic::loongarch_lsx_vrotri_b: +@@ -706,7 +725,19 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vsrlri_b: + case Intrinsic::loongarch_lsx_vsrari_b: + case Intrinsic::loongarch_lsx_vreplvei_h: ++ case Intrinsic::loongarch_lasx_xvsat_b: ++ case Intrinsic::loongarch_lasx_xvsat_bu: ++ case Intrinsic::loongarch_lasx_xvrotri_b: ++ case Intrinsic::loongarch_lasx_xvsllwil_h_b: ++ case Intrinsic::loongarch_lasx_xvsllwil_hu_bu: ++ case Intrinsic::loongarch_lasx_xvsrlri_b: ++ case Intrinsic::loongarch_lasx_xvsrari_b: ++ case Intrinsic::loongarch_lasx_xvrepl128vei_h: ++ case Intrinsic::loongarch_lasx_xvpickve_w: ++ case Intrinsic::loongarch_lasx_xvpickve_w_f: + return checkIntrinsicImmArg<3>(Op, 2, DAG); ++ case Intrinsic::loongarch_lasx_xvinsve0_w: ++ return checkIntrinsicImmArg<3>(Op, 3, DAG); + case Intrinsic::loongarch_lsx_vsat_h: + case Intrinsic::loongarch_lsx_vsat_hu: + case Intrinsic::loongarch_lsx_vrotri_h: +@@ -715,6 +746,14 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vsrlri_h: + case 
Intrinsic::loongarch_lsx_vsrari_h: + case Intrinsic::loongarch_lsx_vreplvei_b: ++ case Intrinsic::loongarch_lasx_xvsat_h: ++ case Intrinsic::loongarch_lasx_xvsat_hu: ++ case Intrinsic::loongarch_lasx_xvrotri_h: ++ case Intrinsic::loongarch_lasx_xvsllwil_w_h: ++ case Intrinsic::loongarch_lasx_xvsllwil_wu_hu: ++ case Intrinsic::loongarch_lasx_xvsrlri_h: ++ case Intrinsic::loongarch_lasx_xvsrari_h: ++ case Intrinsic::loongarch_lasx_xvrepl128vei_b: + return checkIntrinsicImmArg<4>(Op, 2, DAG); + case Intrinsic::loongarch_lsx_vsrlni_b_h: + case Intrinsic::loongarch_lsx_vsrani_b_h: +@@ -728,6 +767,18 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vssrarni_b_h: + case Intrinsic::loongarch_lsx_vssrlrni_bu_h: + case Intrinsic::loongarch_lsx_vssrarni_bu_h: ++ case Intrinsic::loongarch_lasx_xvsrlni_b_h: ++ case Intrinsic::loongarch_lasx_xvsrani_b_h: ++ case Intrinsic::loongarch_lasx_xvsrlrni_b_h: ++ case Intrinsic::loongarch_lasx_xvsrarni_b_h: ++ case Intrinsic::loongarch_lasx_xvssrlni_b_h: ++ case Intrinsic::loongarch_lasx_xvssrani_b_h: ++ case Intrinsic::loongarch_lasx_xvssrlni_bu_h: ++ case Intrinsic::loongarch_lasx_xvssrani_bu_h: ++ case Intrinsic::loongarch_lasx_xvssrlrni_b_h: ++ case Intrinsic::loongarch_lasx_xvssrarni_b_h: ++ case Intrinsic::loongarch_lasx_xvssrlrni_bu_h: ++ case Intrinsic::loongarch_lasx_xvssrarni_bu_h: + return checkIntrinsicImmArg<4>(Op, 3, DAG); + case Intrinsic::loongarch_lsx_vsat_w: + case Intrinsic::loongarch_lsx_vsat_wu: +@@ -746,6 +797,23 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vslti_du: + case Intrinsic::loongarch_lsx_vbsll_v: + case Intrinsic::loongarch_lsx_vbsrl_v: ++ case Intrinsic::loongarch_lasx_xvsat_w: ++ case Intrinsic::loongarch_lasx_xvsat_wu: ++ case Intrinsic::loongarch_lasx_xvrotri_w: ++ case Intrinsic::loongarch_lasx_xvsllwil_d_w: ++ case Intrinsic::loongarch_lasx_xvsllwil_du_wu: ++ case Intrinsic::loongarch_lasx_xvsrlri_w: ++ 
case Intrinsic::loongarch_lasx_xvsrari_w: ++ case Intrinsic::loongarch_lasx_xvslei_bu: ++ case Intrinsic::loongarch_lasx_xvslei_hu: ++ case Intrinsic::loongarch_lasx_xvslei_wu: ++ case Intrinsic::loongarch_lasx_xvslei_du: ++ case Intrinsic::loongarch_lasx_xvslti_bu: ++ case Intrinsic::loongarch_lasx_xvslti_hu: ++ case Intrinsic::loongarch_lasx_xvslti_wu: ++ case Intrinsic::loongarch_lasx_xvslti_du: ++ case Intrinsic::loongarch_lasx_xvbsll_v: ++ case Intrinsic::loongarch_lasx_xvbsrl_v: + return checkIntrinsicImmArg<5>(Op, 2, DAG); + case Intrinsic::loongarch_lsx_vseqi_b: + case Intrinsic::loongarch_lsx_vseqi_h: +@@ -759,6 +827,18 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vslti_h: + case Intrinsic::loongarch_lsx_vslti_w: + case Intrinsic::loongarch_lsx_vslti_d: ++ case Intrinsic::loongarch_lasx_xvseqi_b: ++ case Intrinsic::loongarch_lasx_xvseqi_h: ++ case Intrinsic::loongarch_lasx_xvseqi_w: ++ case Intrinsic::loongarch_lasx_xvseqi_d: ++ case Intrinsic::loongarch_lasx_xvslei_b: ++ case Intrinsic::loongarch_lasx_xvslei_h: ++ case Intrinsic::loongarch_lasx_xvslei_w: ++ case Intrinsic::loongarch_lasx_xvslei_d: ++ case Intrinsic::loongarch_lasx_xvslti_b: ++ case Intrinsic::loongarch_lasx_xvslti_h: ++ case Intrinsic::loongarch_lasx_xvslti_w: ++ case Intrinsic::loongarch_lasx_xvslti_d: + return checkIntrinsicImmArg<5>(Op, 2, DAG, /*IsSigned=*/true); + case Intrinsic::loongarch_lsx_vsrlni_h_w: + case Intrinsic::loongarch_lsx_vsrani_h_w: +@@ -774,12 +854,31 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vssrarni_hu_w: + case Intrinsic::loongarch_lsx_vfrstpi_b: + case Intrinsic::loongarch_lsx_vfrstpi_h: ++ case Intrinsic::loongarch_lasx_xvsrlni_h_w: ++ case Intrinsic::loongarch_lasx_xvsrani_h_w: ++ case Intrinsic::loongarch_lasx_xvsrlrni_h_w: ++ case Intrinsic::loongarch_lasx_xvsrarni_h_w: ++ case Intrinsic::loongarch_lasx_xvssrlni_h_w: ++ case 
Intrinsic::loongarch_lasx_xvssrani_h_w: ++ case Intrinsic::loongarch_lasx_xvssrlni_hu_w: ++ case Intrinsic::loongarch_lasx_xvssrani_hu_w: ++ case Intrinsic::loongarch_lasx_xvssrlrni_h_w: ++ case Intrinsic::loongarch_lasx_xvssrarni_h_w: ++ case Intrinsic::loongarch_lasx_xvssrlrni_hu_w: ++ case Intrinsic::loongarch_lasx_xvssrarni_hu_w: ++ case Intrinsic::loongarch_lasx_xvfrstpi_b: ++ case Intrinsic::loongarch_lasx_xvfrstpi_h: + return checkIntrinsicImmArg<5>(Op, 3, DAG); + case Intrinsic::loongarch_lsx_vsat_d: + case Intrinsic::loongarch_lsx_vsat_du: + case Intrinsic::loongarch_lsx_vrotri_d: + case Intrinsic::loongarch_lsx_vsrlri_d: + case Intrinsic::loongarch_lsx_vsrari_d: ++ case Intrinsic::loongarch_lasx_xvsat_d: ++ case Intrinsic::loongarch_lasx_xvsat_du: ++ case Intrinsic::loongarch_lasx_xvrotri_d: ++ case Intrinsic::loongarch_lasx_xvsrlri_d: ++ case Intrinsic::loongarch_lasx_xvsrari_d: + return checkIntrinsicImmArg<6>(Op, 2, DAG); + case Intrinsic::loongarch_lsx_vsrlni_w_d: + case Intrinsic::loongarch_lsx_vsrani_w_d: +@@ -793,6 +892,18 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vssrarni_w_d: + case Intrinsic::loongarch_lsx_vssrlrni_wu_d: + case Intrinsic::loongarch_lsx_vssrarni_wu_d: ++ case Intrinsic::loongarch_lasx_xvsrlni_w_d: ++ case Intrinsic::loongarch_lasx_xvsrani_w_d: ++ case Intrinsic::loongarch_lasx_xvsrlrni_w_d: ++ case Intrinsic::loongarch_lasx_xvsrarni_w_d: ++ case Intrinsic::loongarch_lasx_xvssrlni_w_d: ++ case Intrinsic::loongarch_lasx_xvssrani_w_d: ++ case Intrinsic::loongarch_lasx_xvssrlni_wu_d: ++ case Intrinsic::loongarch_lasx_xvssrani_wu_d: ++ case Intrinsic::loongarch_lasx_xvssrlrni_w_d: ++ case Intrinsic::loongarch_lasx_xvssrarni_w_d: ++ case Intrinsic::loongarch_lasx_xvssrlrni_wu_d: ++ case Intrinsic::loongarch_lasx_xvssrarni_wu_d: + return checkIntrinsicImmArg<6>(Op, 3, DAG); + case Intrinsic::loongarch_lsx_vsrlni_d_q: + case Intrinsic::loongarch_lsx_vsrani_d_q: +@@ -806,11 +917,28 @@ 
LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vssrarni_d_q: + case Intrinsic::loongarch_lsx_vssrlrni_du_q: + case Intrinsic::loongarch_lsx_vssrarni_du_q: ++ case Intrinsic::loongarch_lasx_xvsrlni_d_q: ++ case Intrinsic::loongarch_lasx_xvsrani_d_q: ++ case Intrinsic::loongarch_lasx_xvsrlrni_d_q: ++ case Intrinsic::loongarch_lasx_xvsrarni_d_q: ++ case Intrinsic::loongarch_lasx_xvssrlni_d_q: ++ case Intrinsic::loongarch_lasx_xvssrani_d_q: ++ case Intrinsic::loongarch_lasx_xvssrlni_du_q: ++ case Intrinsic::loongarch_lasx_xvssrani_du_q: ++ case Intrinsic::loongarch_lasx_xvssrlrni_d_q: ++ case Intrinsic::loongarch_lasx_xvssrarni_d_q: ++ case Intrinsic::loongarch_lasx_xvssrlrni_du_q: ++ case Intrinsic::loongarch_lasx_xvssrarni_du_q: + return checkIntrinsicImmArg<7>(Op, 3, DAG); + case Intrinsic::loongarch_lsx_vnori_b: + case Intrinsic::loongarch_lsx_vshuf4i_b: + case Intrinsic::loongarch_lsx_vshuf4i_h: + case Intrinsic::loongarch_lsx_vshuf4i_w: ++ case Intrinsic::loongarch_lasx_xvnori_b: ++ case Intrinsic::loongarch_lasx_xvshuf4i_b: ++ case Intrinsic::loongarch_lasx_xvshuf4i_h: ++ case Intrinsic::loongarch_lasx_xvshuf4i_w: ++ case Intrinsic::loongarch_lasx_xvpermi_d: + return checkIntrinsicImmArg<8>(Op, 2, DAG); + case Intrinsic::loongarch_lsx_vshuf4i_d: + case Intrinsic::loongarch_lsx_vpermi_w: +@@ -819,13 +947,26 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, + case Intrinsic::loongarch_lsx_vextrins_h: + case Intrinsic::loongarch_lsx_vextrins_w: + case Intrinsic::loongarch_lsx_vextrins_d: ++ case Intrinsic::loongarch_lasx_xvshuf4i_d: ++ case Intrinsic::loongarch_lasx_xvpermi_w: ++ case Intrinsic::loongarch_lasx_xvpermi_q: ++ case Intrinsic::loongarch_lasx_xvbitseli_b: ++ case Intrinsic::loongarch_lasx_xvextrins_b: ++ case Intrinsic::loongarch_lasx_xvextrins_h: ++ case Intrinsic::loongarch_lasx_xvextrins_w: ++ case Intrinsic::loongarch_lasx_xvextrins_d: + return checkIntrinsicImmArg<8>(Op, 3, DAG); + case 
Intrinsic::loongarch_lsx_vrepli_b: + case Intrinsic::loongarch_lsx_vrepli_h: + case Intrinsic::loongarch_lsx_vrepli_w: + case Intrinsic::loongarch_lsx_vrepli_d: ++ case Intrinsic::loongarch_lasx_xvrepli_b: ++ case Intrinsic::loongarch_lasx_xvrepli_h: ++ case Intrinsic::loongarch_lasx_xvrepli_w: ++ case Intrinsic::loongarch_lasx_xvrepli_d: + return checkIntrinsicImmArg<10>(Op, 1, DAG, /*IsSigned=*/true); + case Intrinsic::loongarch_lsx_vldi: ++ case Intrinsic::loongarch_lasx_xvldi: + return checkIntrinsicImmArg<13>(Op, 1, DAG, /*IsSigned=*/true); + } + } +@@ -924,22 +1065,27 @@ LoongArchTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op, + } + case Intrinsic::loongarch_lsx_vld: + case Intrinsic::loongarch_lsx_vldrepl_b: ++ case Intrinsic::loongarch_lasx_xvld: ++ case Intrinsic::loongarch_lasx_xvldrepl_b: + return !isInt<12>(cast(Op.getOperand(3))->getSExtValue()) + ? emitIntrinsicWithChainErrorMessage(Op, ErrorMsgOOR, DAG) + : SDValue(); + case Intrinsic::loongarch_lsx_vldrepl_h: ++ case Intrinsic::loongarch_lasx_xvldrepl_h: + return !isShiftedInt<11, 1>( + cast(Op.getOperand(3))->getSExtValue()) + ? emitIntrinsicWithChainErrorMessage( + Op, "argument out of range or not a multiple of 2", DAG) + : SDValue(); + case Intrinsic::loongarch_lsx_vldrepl_w: ++ case Intrinsic::loongarch_lasx_xvldrepl_w: + return !isShiftedInt<10, 2>( + cast(Op.getOperand(3))->getSExtValue()) + ? emitIntrinsicWithChainErrorMessage( + Op, "argument out of range or not a multiple of 4", DAG) + : SDValue(); + case Intrinsic::loongarch_lsx_vldrepl_d: ++ case Intrinsic::loongarch_lasx_xvldrepl_d: + return !isShiftedInt<9, 3>( + cast(Op.getOperand(3))->getSExtValue()) + ? emitIntrinsicWithChainErrorMessage( +@@ -1064,14 +1210,27 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, + : Op; + } + case Intrinsic::loongarch_lsx_vst: ++ case Intrinsic::loongarch_lasx_xvst: + return !isInt<12>(cast(Op.getOperand(4))->getSExtValue()) + ? 
emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) + : SDValue(); ++ case Intrinsic::loongarch_lasx_xvstelm_b: ++ return (!isInt<8>(cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<5>(cast(Op.getOperand(5))->getZExtValue())) ++ ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) ++ : SDValue(); + case Intrinsic::loongarch_lsx_vstelm_b: + return (!isInt<8>(cast(Op.getOperand(4))->getSExtValue()) || + !isUInt<4>(cast(Op.getOperand(5))->getZExtValue())) + ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) + : SDValue(); ++ case Intrinsic::loongarch_lasx_xvstelm_h: ++ return (!isShiftedInt<8, 1>( ++ cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<4>(cast(Op.getOperand(5))->getZExtValue())) ++ ? emitIntrinsicErrorMessage( ++ Op, "argument out of range or not a multiple of 2", DAG) ++ : SDValue(); + case Intrinsic::loongarch_lsx_vstelm_h: + return (!isShiftedInt<8, 1>( + cast(Op.getOperand(4))->getSExtValue()) || +@@ -1079,6 +1238,13 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, + ? emitIntrinsicErrorMessage( + Op, "argument out of range or not a multiple of 2", DAG) + : SDValue(); ++ case Intrinsic::loongarch_lasx_xvstelm_w: ++ return (!isShiftedInt<8, 2>( ++ cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<3>(cast(Op.getOperand(5))->getZExtValue())) ++ ? emitIntrinsicErrorMessage( ++ Op, "argument out of range or not a multiple of 4", DAG) ++ : SDValue(); + case Intrinsic::loongarch_lsx_vstelm_w: + return (!isShiftedInt<8, 2>( + cast(Op.getOperand(4))->getSExtValue()) || +@@ -1086,6 +1252,13 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, + ? emitIntrinsicErrorMessage( + Op, "argument out of range or not a multiple of 4", DAG) + : SDValue(); ++ case Intrinsic::loongarch_lasx_xvstelm_d: ++ return (!isShiftedInt<8, 3>( ++ cast(Op.getOperand(4))->getSExtValue()) || ++ !isUInt<2>(cast(Op.getOperand(5))->getZExtValue())) ++ ? 
emitIntrinsicErrorMessage( ++ Op, "argument out of range or not a multiple of 8", DAG) ++ : SDValue(); + case Intrinsic::loongarch_lsx_vstelm_d: + return (!isShiftedInt<8, 3>( + cast(Op.getOperand(4))->getSExtValue()) || +@@ -1304,6 +1477,7 @@ replaceINTRINSIC_WO_CHAINResults(SDNode *N, SmallVectorImpl &Results, + LoongArchISD::VPICK_SEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_h: ++ case Intrinsic::loongarch_lasx_xvpickve2gr_w: + replaceVPICKVE2GRResults<3>(N, Results, DAG, Subtarget, + LoongArchISD::VPICK_SEXT_ELT); + break; +@@ -1316,6 +1490,7 @@ replaceINTRINSIC_WO_CHAINResults(SDNode *N, SmallVectorImpl &Results, + LoongArchISD::VPICK_ZEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_hu: ++ case Intrinsic::loongarch_lasx_xvpickve2gr_wu: + replaceVPICKVE2GRResults<3>(N, Results, DAG, Subtarget, + LoongArchISD::VPICK_ZEXT_ELT); + break; +@@ -1327,10 +1502,15 @@ replaceINTRINSIC_WO_CHAINResults(SDNode *N, SmallVectorImpl &Results, + case Intrinsic::loongarch_lsx_bz_h: + case Intrinsic::loongarch_lsx_bz_w: + case Intrinsic::loongarch_lsx_bz_d: ++ case Intrinsic::loongarch_lasx_xbz_b: ++ case Intrinsic::loongarch_lasx_xbz_h: ++ case Intrinsic::loongarch_lasx_xbz_w: ++ case Intrinsic::loongarch_lasx_xbz_d: + replaceVecCondBranchResults(N, Results, DAG, Subtarget, + LoongArchISD::VALL_ZERO); + break; + case Intrinsic::loongarch_lsx_bz_v: ++ case Intrinsic::loongarch_lasx_xbz_v: + replaceVecCondBranchResults(N, Results, DAG, Subtarget, + LoongArchISD::VANY_ZERO); + break; +@@ -1338,10 +1518,15 @@ replaceINTRINSIC_WO_CHAINResults(SDNode *N, SmallVectorImpl &Results, + case Intrinsic::loongarch_lsx_bnz_h: + case Intrinsic::loongarch_lsx_bnz_w: + case Intrinsic::loongarch_lsx_bnz_d: ++ case Intrinsic::loongarch_lasx_xbnz_b: ++ case Intrinsic::loongarch_lasx_xbnz_h: ++ case Intrinsic::loongarch_lasx_xbnz_w: ++ case Intrinsic::loongarch_lasx_xbnz_d: + replaceVecCondBranchResults(N, Results, DAG, Subtarget, + LoongArchISD::VALL_NONZERO); + 
break; + case Intrinsic::loongarch_lsx_bnz_v: ++ case Intrinsic::loongarch_lasx_xbnz_v: + replaceVecCondBranchResults(N, Results, DAG, Subtarget, + LoongArchISD::VANY_NONZERO); + break; +@@ -2114,30 +2299,50 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lsx_vadd_h: + case Intrinsic::loongarch_lsx_vadd_w: + case Intrinsic::loongarch_lsx_vadd_d: ++ case Intrinsic::loongarch_lasx_xvadd_b: ++ case Intrinsic::loongarch_lasx_xvadd_h: ++ case Intrinsic::loongarch_lasx_xvadd_w: ++ case Intrinsic::loongarch_lasx_xvadd_d: + return DAG.getNode(ISD::ADD, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vaddi_bu: + case Intrinsic::loongarch_lsx_vaddi_hu: + case Intrinsic::loongarch_lsx_vaddi_wu: + case Intrinsic::loongarch_lsx_vaddi_du: ++ case Intrinsic::loongarch_lasx_xvaddi_bu: ++ case Intrinsic::loongarch_lasx_xvaddi_hu: ++ case Intrinsic::loongarch_lasx_xvaddi_wu: ++ case Intrinsic::loongarch_lasx_xvaddi_du: + return DAG.getNode(ISD::ADD, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsub_b: + case Intrinsic::loongarch_lsx_vsub_h: + case Intrinsic::loongarch_lsx_vsub_w: + case Intrinsic::loongarch_lsx_vsub_d: ++ case Intrinsic::loongarch_lasx_xvsub_b: ++ case Intrinsic::loongarch_lasx_xvsub_h: ++ case Intrinsic::loongarch_lasx_xvsub_w: ++ case Intrinsic::loongarch_lasx_xvsub_d: + return DAG.getNode(ISD::SUB, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vsubi_bu: + case Intrinsic::loongarch_lsx_vsubi_hu: + case Intrinsic::loongarch_lsx_vsubi_wu: + case Intrinsic::loongarch_lsx_vsubi_du: ++ case Intrinsic::loongarch_lasx_xvsubi_bu: ++ case Intrinsic::loongarch_lasx_xvsubi_hu: ++ case Intrinsic::loongarch_lasx_xvsubi_wu: ++ case Intrinsic::loongarch_lasx_xvsubi_du: + return DAG.getNode(ISD::SUB, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, 
DAG)); + case Intrinsic::loongarch_lsx_vneg_b: + case Intrinsic::loongarch_lsx_vneg_h: + case Intrinsic::loongarch_lsx_vneg_w: + case Intrinsic::loongarch_lsx_vneg_d: ++ case Intrinsic::loongarch_lasx_xvneg_b: ++ case Intrinsic::loongarch_lasx_xvneg_h: ++ case Intrinsic::loongarch_lasx_xvneg_w: ++ case Intrinsic::loongarch_lasx_xvneg_d: + return DAG.getNode( + ISD::SUB, DL, N->getValueType(0), + DAG.getConstant( +@@ -2149,60 +2354,100 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lsx_vmax_h: + case Intrinsic::loongarch_lsx_vmax_w: + case Intrinsic::loongarch_lsx_vmax_d: ++ case Intrinsic::loongarch_lasx_xvmax_b: ++ case Intrinsic::loongarch_lasx_xvmax_h: ++ case Intrinsic::loongarch_lasx_xvmax_w: ++ case Intrinsic::loongarch_lasx_xvmax_d: + return DAG.getNode(ISD::SMAX, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmax_bu: + case Intrinsic::loongarch_lsx_vmax_hu: + case Intrinsic::loongarch_lsx_vmax_wu: + case Intrinsic::loongarch_lsx_vmax_du: ++ case Intrinsic::loongarch_lasx_xvmax_bu: ++ case Intrinsic::loongarch_lasx_xvmax_hu: ++ case Intrinsic::loongarch_lasx_xvmax_wu: ++ case Intrinsic::loongarch_lasx_xvmax_du: + return DAG.getNode(ISD::UMAX, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmaxi_b: + case Intrinsic::loongarch_lsx_vmaxi_h: + case Intrinsic::loongarch_lsx_vmaxi_w: + case Intrinsic::loongarch_lsx_vmaxi_d: ++ case Intrinsic::loongarch_lasx_xvmaxi_b: ++ case Intrinsic::loongarch_lasx_xvmaxi_h: ++ case Intrinsic::loongarch_lasx_xvmaxi_w: ++ case Intrinsic::loongarch_lasx_xvmaxi_d: + return DAG.getNode(ISD::SMAX, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG, /*IsSigned=*/true)); + case Intrinsic::loongarch_lsx_vmaxi_bu: + case Intrinsic::loongarch_lsx_vmaxi_hu: + case Intrinsic::loongarch_lsx_vmaxi_wu: + case Intrinsic::loongarch_lsx_vmaxi_du: ++ case 
Intrinsic::loongarch_lasx_xvmaxi_bu: ++ case Intrinsic::loongarch_lasx_xvmaxi_hu: ++ case Intrinsic::loongarch_lasx_xvmaxi_wu: ++ case Intrinsic::loongarch_lasx_xvmaxi_du: + return DAG.getNode(ISD::UMAX, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vmin_b: + case Intrinsic::loongarch_lsx_vmin_h: + case Intrinsic::loongarch_lsx_vmin_w: + case Intrinsic::loongarch_lsx_vmin_d: ++ case Intrinsic::loongarch_lasx_xvmin_b: ++ case Intrinsic::loongarch_lasx_xvmin_h: ++ case Intrinsic::loongarch_lasx_xvmin_w: ++ case Intrinsic::loongarch_lasx_xvmin_d: + return DAG.getNode(ISD::SMIN, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmin_bu: + case Intrinsic::loongarch_lsx_vmin_hu: + case Intrinsic::loongarch_lsx_vmin_wu: + case Intrinsic::loongarch_lsx_vmin_du: ++ case Intrinsic::loongarch_lasx_xvmin_bu: ++ case Intrinsic::loongarch_lasx_xvmin_hu: ++ case Intrinsic::loongarch_lasx_xvmin_wu: ++ case Intrinsic::loongarch_lasx_xvmin_du: + return DAG.getNode(ISD::UMIN, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmini_b: + case Intrinsic::loongarch_lsx_vmini_h: + case Intrinsic::loongarch_lsx_vmini_w: + case Intrinsic::loongarch_lsx_vmini_d: ++ case Intrinsic::loongarch_lasx_xvmini_b: ++ case Intrinsic::loongarch_lasx_xvmini_h: ++ case Intrinsic::loongarch_lasx_xvmini_w: ++ case Intrinsic::loongarch_lasx_xvmini_d: + return DAG.getNode(ISD::SMIN, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG, /*IsSigned=*/true)); + case Intrinsic::loongarch_lsx_vmini_bu: + case Intrinsic::loongarch_lsx_vmini_hu: + case Intrinsic::loongarch_lsx_vmini_wu: + case Intrinsic::loongarch_lsx_vmini_du: ++ case Intrinsic::loongarch_lasx_xvmini_bu: ++ case Intrinsic::loongarch_lasx_xvmini_hu: ++ case Intrinsic::loongarch_lasx_xvmini_wu: ++ case Intrinsic::loongarch_lasx_xvmini_du: + return DAG.getNode(ISD::UMIN, 
DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vmul_b: + case Intrinsic::loongarch_lsx_vmul_h: + case Intrinsic::loongarch_lsx_vmul_w: + case Intrinsic::loongarch_lsx_vmul_d: ++ case Intrinsic::loongarch_lasx_xvmul_b: ++ case Intrinsic::loongarch_lasx_xvmul_h: ++ case Intrinsic::loongarch_lasx_xvmul_w: ++ case Intrinsic::loongarch_lasx_xvmul_d: + return DAG.getNode(ISD::MUL, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmadd_b: + case Intrinsic::loongarch_lsx_vmadd_h: + case Intrinsic::loongarch_lsx_vmadd_w: +- case Intrinsic::loongarch_lsx_vmadd_d: { ++ case Intrinsic::loongarch_lsx_vmadd_d: ++ case Intrinsic::loongarch_lasx_xvmadd_b: ++ case Intrinsic::loongarch_lasx_xvmadd_h: ++ case Intrinsic::loongarch_lasx_xvmadd_w: ++ case Intrinsic::loongarch_lasx_xvmadd_d: { + EVT ResTy = N->getValueType(0); + return DAG.getNode(ISD::ADD, SDLoc(N), ResTy, N->getOperand(1), + DAG.getNode(ISD::MUL, SDLoc(N), ResTy, N->getOperand(2), +@@ -2211,7 +2456,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lsx_vmsub_b: + case Intrinsic::loongarch_lsx_vmsub_h: + case Intrinsic::loongarch_lsx_vmsub_w: +- case Intrinsic::loongarch_lsx_vmsub_d: { ++ case Intrinsic::loongarch_lsx_vmsub_d: ++ case Intrinsic::loongarch_lasx_xvmsub_b: ++ case Intrinsic::loongarch_lasx_xvmsub_h: ++ case Intrinsic::loongarch_lasx_xvmsub_w: ++ case Intrinsic::loongarch_lasx_xvmsub_d: { + EVT ResTy = N->getValueType(0); + return DAG.getNode(ISD::SUB, SDLoc(N), ResTy, N->getOperand(1), + DAG.getNode(ISD::MUL, SDLoc(N), ResTy, N->getOperand(2), +@@ -2221,125 +2470,188 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lsx_vdiv_h: + case Intrinsic::loongarch_lsx_vdiv_w: + case Intrinsic::loongarch_lsx_vdiv_d: ++ case Intrinsic::loongarch_lasx_xvdiv_b: ++ case Intrinsic::loongarch_lasx_xvdiv_h: ++ case 
Intrinsic::loongarch_lasx_xvdiv_w: ++ case Intrinsic::loongarch_lasx_xvdiv_d: + return DAG.getNode(ISD::SDIV, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vdiv_bu: + case Intrinsic::loongarch_lsx_vdiv_hu: + case Intrinsic::loongarch_lsx_vdiv_wu: + case Intrinsic::loongarch_lsx_vdiv_du: ++ case Intrinsic::loongarch_lasx_xvdiv_bu: ++ case Intrinsic::loongarch_lasx_xvdiv_hu: ++ case Intrinsic::loongarch_lasx_xvdiv_wu: ++ case Intrinsic::loongarch_lasx_xvdiv_du: + return DAG.getNode(ISD::UDIV, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmod_b: + case Intrinsic::loongarch_lsx_vmod_h: + case Intrinsic::loongarch_lsx_vmod_w: + case Intrinsic::loongarch_lsx_vmod_d: ++ case Intrinsic::loongarch_lasx_xvmod_b: ++ case Intrinsic::loongarch_lasx_xvmod_h: ++ case Intrinsic::loongarch_lasx_xvmod_w: ++ case Intrinsic::loongarch_lasx_xvmod_d: + return DAG.getNode(ISD::SREM, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vmod_bu: + case Intrinsic::loongarch_lsx_vmod_hu: + case Intrinsic::loongarch_lsx_vmod_wu: + case Intrinsic::loongarch_lsx_vmod_du: ++ case Intrinsic::loongarch_lasx_xvmod_bu: ++ case Intrinsic::loongarch_lasx_xvmod_hu: ++ case Intrinsic::loongarch_lasx_xvmod_wu: ++ case Intrinsic::loongarch_lasx_xvmod_du: + return DAG.getNode(ISD::UREM, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vand_v: ++ case Intrinsic::loongarch_lasx_xvand_v: + return DAG.getNode(ISD::AND, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vor_v: ++ case Intrinsic::loongarch_lasx_xvor_v: + return DAG.getNode(ISD::OR, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vxor_v: ++ case Intrinsic::loongarch_lasx_xvxor_v: + return DAG.getNode(ISD::XOR, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); 
+- case Intrinsic::loongarch_lsx_vnor_v: { ++ case Intrinsic::loongarch_lsx_vnor_v: ++ case Intrinsic::loongarch_lasx_xvnor_v: { + SDValue Res = DAG.getNode(ISD::OR, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + return DAG.getNOT(DL, Res, Res->getValueType(0)); + } + case Intrinsic::loongarch_lsx_vandi_b: ++ case Intrinsic::loongarch_lasx_xvandi_b: + return DAG.getNode(ISD::AND, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<8>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vori_b: ++ case Intrinsic::loongarch_lasx_xvori_b: + return DAG.getNode(ISD::OR, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<8>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vxori_b: ++ case Intrinsic::loongarch_lasx_xvxori_b: + return DAG.getNode(ISD::XOR, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<8>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsll_b: + case Intrinsic::loongarch_lsx_vsll_h: + case Intrinsic::loongarch_lsx_vsll_w: + case Intrinsic::loongarch_lsx_vsll_d: ++ case Intrinsic::loongarch_lasx_xvsll_b: ++ case Intrinsic::loongarch_lasx_xvsll_h: ++ case Intrinsic::loongarch_lasx_xvsll_w: ++ case Intrinsic::loongarch_lasx_xvsll_d: + return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), + truncateVecElts(N, DAG)); + case Intrinsic::loongarch_lsx_vslli_b: ++ case Intrinsic::loongarch_lasx_xvslli_b: + return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<3>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vslli_h: ++ case Intrinsic::loongarch_lasx_xvslli_h: + return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<4>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vslli_w: ++ case Intrinsic::loongarch_lasx_xvslli_w: + return DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vslli_d: ++ case Intrinsic::loongarch_lasx_xvslli_d: + return 
DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<6>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrl_b: + case Intrinsic::loongarch_lsx_vsrl_h: + case Intrinsic::loongarch_lsx_vsrl_w: + case Intrinsic::loongarch_lsx_vsrl_d: ++ case Intrinsic::loongarch_lasx_xvsrl_b: ++ case Intrinsic::loongarch_lasx_xvsrl_h: ++ case Intrinsic::loongarch_lasx_xvsrl_w: ++ case Intrinsic::loongarch_lasx_xvsrl_d: + return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), + truncateVecElts(N, DAG)); + case Intrinsic::loongarch_lsx_vsrli_b: ++ case Intrinsic::loongarch_lasx_xvsrli_b: + return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<3>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrli_h: ++ case Intrinsic::loongarch_lasx_xvsrli_h: + return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<4>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrli_w: ++ case Intrinsic::loongarch_lasx_xvsrli_w: + return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrli_d: ++ case Intrinsic::loongarch_lasx_xvsrli_d: + return DAG.getNode(ISD::SRL, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<6>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsra_b: + case Intrinsic::loongarch_lsx_vsra_h: + case Intrinsic::loongarch_lsx_vsra_w: + case Intrinsic::loongarch_lsx_vsra_d: ++ case Intrinsic::loongarch_lasx_xvsra_b: ++ case Intrinsic::loongarch_lasx_xvsra_h: ++ case Intrinsic::loongarch_lasx_xvsra_w: ++ case Intrinsic::loongarch_lasx_xvsra_d: + return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), + truncateVecElts(N, DAG)); + case Intrinsic::loongarch_lsx_vsrai_b: ++ case Intrinsic::loongarch_lasx_xvsrai_b: + return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<3>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrai_h: ++ case 
Intrinsic::loongarch_lasx_xvsrai_h: + return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<4>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrai_w: ++ case Intrinsic::loongarch_lasx_xvsrai_w: + return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<5>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vsrai_d: ++ case Intrinsic::loongarch_lasx_xvsrai_d: + return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<6>(N, 2, DAG)); + case Intrinsic::loongarch_lsx_vpcnt_b: + case Intrinsic::loongarch_lsx_vpcnt_h: + case Intrinsic::loongarch_lsx_vpcnt_w: + case Intrinsic::loongarch_lsx_vpcnt_d: ++ case Intrinsic::loongarch_lasx_xvpcnt_b: ++ case Intrinsic::loongarch_lasx_xvpcnt_h: ++ case Intrinsic::loongarch_lasx_xvpcnt_w: ++ case Intrinsic::loongarch_lasx_xvpcnt_d: + return DAG.getNode(ISD::CTPOP, DL, N->getValueType(0), N->getOperand(1)); + case Intrinsic::loongarch_lsx_vbitclr_b: + case Intrinsic::loongarch_lsx_vbitclr_h: + case Intrinsic::loongarch_lsx_vbitclr_w: + case Intrinsic::loongarch_lsx_vbitclr_d: ++ case Intrinsic::loongarch_lasx_xvbitclr_b: ++ case Intrinsic::loongarch_lasx_xvbitclr_h: ++ case Intrinsic::loongarch_lasx_xvbitclr_w: ++ case Intrinsic::loongarch_lasx_xvbitclr_d: + return lowerVectorBitClear(N, DAG); + case Intrinsic::loongarch_lsx_vbitclri_b: ++ case Intrinsic::loongarch_lasx_xvbitclri_b: + return lowerVectorBitClearImm<3>(N, DAG); + case Intrinsic::loongarch_lsx_vbitclri_h: ++ case Intrinsic::loongarch_lasx_xvbitclri_h: + return lowerVectorBitClearImm<4>(N, DAG); + case Intrinsic::loongarch_lsx_vbitclri_w: ++ case Intrinsic::loongarch_lasx_xvbitclri_w: + return lowerVectorBitClearImm<5>(N, DAG); + case Intrinsic::loongarch_lsx_vbitclri_d: ++ case Intrinsic::loongarch_lasx_xvbitclri_d: + return lowerVectorBitClearImm<6>(N, DAG); + case Intrinsic::loongarch_lsx_vbitset_b: + case Intrinsic::loongarch_lsx_vbitset_h: + case 
Intrinsic::loongarch_lsx_vbitset_w: +- case Intrinsic::loongarch_lsx_vbitset_d: { ++ case Intrinsic::loongarch_lsx_vbitset_d: ++ case Intrinsic::loongarch_lasx_xvbitset_b: ++ case Intrinsic::loongarch_lasx_xvbitset_h: ++ case Intrinsic::loongarch_lasx_xvbitset_w: ++ case Intrinsic::loongarch_lasx_xvbitset_d: { + EVT VecTy = N->getValueType(0); + SDValue One = DAG.getConstant(1, DL, VecTy); + return DAG.getNode( +@@ -2347,17 +2659,25 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + DAG.getNode(ISD::SHL, DL, VecTy, One, truncateVecElts(N, DAG))); + } + case Intrinsic::loongarch_lsx_vbitseti_b: ++ case Intrinsic::loongarch_lasx_xvbitseti_b: + return lowerVectorBitSetImm<3>(N, DAG); + case Intrinsic::loongarch_lsx_vbitseti_h: ++ case Intrinsic::loongarch_lasx_xvbitseti_h: + return lowerVectorBitSetImm<4>(N, DAG); + case Intrinsic::loongarch_lsx_vbitseti_w: ++ case Intrinsic::loongarch_lasx_xvbitseti_w: + return lowerVectorBitSetImm<5>(N, DAG); + case Intrinsic::loongarch_lsx_vbitseti_d: ++ case Intrinsic::loongarch_lasx_xvbitseti_d: + return lowerVectorBitSetImm<6>(N, DAG); + case Intrinsic::loongarch_lsx_vbitrev_b: + case Intrinsic::loongarch_lsx_vbitrev_h: + case Intrinsic::loongarch_lsx_vbitrev_w: +- case Intrinsic::loongarch_lsx_vbitrev_d: { ++ case Intrinsic::loongarch_lsx_vbitrev_d: ++ case Intrinsic::loongarch_lasx_xvbitrev_b: ++ case Intrinsic::loongarch_lasx_xvbitrev_h: ++ case Intrinsic::loongarch_lasx_xvbitrev_w: ++ case Intrinsic::loongarch_lasx_xvbitrev_d: { + EVT VecTy = N->getValueType(0); + SDValue One = DAG.getConstant(1, DL, VecTy); + return DAG.getNode( +@@ -2365,31 +2685,45 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + DAG.getNode(ISD::SHL, DL, VecTy, One, truncateVecElts(N, DAG))); + } + case Intrinsic::loongarch_lsx_vbitrevi_b: ++ case Intrinsic::loongarch_lasx_xvbitrevi_b: + return lowerVectorBitRevImm<3>(N, DAG); + case Intrinsic::loongarch_lsx_vbitrevi_h: ++ case Intrinsic::loongarch_lasx_xvbitrevi_h: + 
return lowerVectorBitRevImm<4>(N, DAG); + case Intrinsic::loongarch_lsx_vbitrevi_w: ++ case Intrinsic::loongarch_lasx_xvbitrevi_w: + return lowerVectorBitRevImm<5>(N, DAG); + case Intrinsic::loongarch_lsx_vbitrevi_d: ++ case Intrinsic::loongarch_lasx_xvbitrevi_d: + return lowerVectorBitRevImm<6>(N, DAG); + case Intrinsic::loongarch_lsx_vfadd_s: + case Intrinsic::loongarch_lsx_vfadd_d: ++ case Intrinsic::loongarch_lasx_xvfadd_s: ++ case Intrinsic::loongarch_lasx_xvfadd_d: + return DAG.getNode(ISD::FADD, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vfsub_s: + case Intrinsic::loongarch_lsx_vfsub_d: ++ case Intrinsic::loongarch_lasx_xvfsub_s: ++ case Intrinsic::loongarch_lasx_xvfsub_d: + return DAG.getNode(ISD::FSUB, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vfmul_s: + case Intrinsic::loongarch_lsx_vfmul_d: ++ case Intrinsic::loongarch_lasx_xvfmul_s: ++ case Intrinsic::loongarch_lasx_xvfmul_d: + return DAG.getNode(ISD::FMUL, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vfdiv_s: + case Intrinsic::loongarch_lsx_vfdiv_d: ++ case Intrinsic::loongarch_lasx_xvfdiv_s: ++ case Intrinsic::loongarch_lasx_xvfdiv_d: + return DAG.getNode(ISD::FDIV, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2)); + case Intrinsic::loongarch_lsx_vfmadd_s: + case Intrinsic::loongarch_lsx_vfmadd_d: ++ case Intrinsic::loongarch_lasx_xvfmadd_s: ++ case Intrinsic::loongarch_lasx_xvfmadd_d: + return DAG.getNode(ISD::FMA, DL, N->getValueType(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3)); + case Intrinsic::loongarch_lsx_vinsgr2vr_b: +@@ -2397,10 +2731,12 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + N->getOperand(1), N->getOperand(2), + legalizeIntrinsicImmArg<4>(N, 3, DAG, Subtarget)); + case Intrinsic::loongarch_lsx_vinsgr2vr_h: ++ case Intrinsic::loongarch_lasx_xvinsgr2vr_w: + return 
DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), + legalizeIntrinsicImmArg<3>(N, 3, DAG, Subtarget)); + case Intrinsic::loongarch_lsx_vinsgr2vr_w: ++ case Intrinsic::loongarch_lasx_xvinsgr2vr_d: + return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2), + legalizeIntrinsicImmArg<2>(N, 3, DAG, Subtarget)); +@@ -2411,7 +2747,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lsx_vreplgr2vr_b: + case Intrinsic::loongarch_lsx_vreplgr2vr_h: + case Intrinsic::loongarch_lsx_vreplgr2vr_w: +- case Intrinsic::loongarch_lsx_vreplgr2vr_d: { ++ case Intrinsic::loongarch_lsx_vreplgr2vr_d: ++ case Intrinsic::loongarch_lasx_xvreplgr2vr_b: ++ case Intrinsic::loongarch_lasx_xvreplgr2vr_h: ++ case Intrinsic::loongarch_lasx_xvreplgr2vr_w: ++ case Intrinsic::loongarch_lasx_xvreplgr2vr_d: { + EVT ResTy = N->getValueType(0); + SmallVector Ops(ResTy.getVectorNumElements(), N->getOperand(1)); + return DAG.getBuildVector(ResTy, DL, Ops); +@@ -2420,6 +2760,10 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lsx_vreplve_h: + case Intrinsic::loongarch_lsx_vreplve_w: + case Intrinsic::loongarch_lsx_vreplve_d: ++ case Intrinsic::loongarch_lasx_xvreplve_b: ++ case Intrinsic::loongarch_lasx_xvreplve_h: ++ case Intrinsic::loongarch_lasx_xvreplve_w: ++ case Intrinsic::loongarch_lasx_xvreplve_d: + return DAG.getNode(LoongArchISD::VREPLVE, DL, N->getValueType(0), + N->getOperand(1), + DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getGRLenVT(), +@@ -2534,6 +2878,36 @@ emitVecCondBranchPseudo(MachineInstr &MI, MachineBasicBlock *BB, + case LoongArch::PseudoVBNZ_D: + CondOpc = LoongArch::VSETALLNEZ_D; + break; ++ case LoongArch::PseudoXVBZ: ++ CondOpc = LoongArch::XVSETEQZ_V; ++ break; ++ case LoongArch::PseudoXVBZ_B: ++ CondOpc = LoongArch::XVSETANYEQZ_B; ++ break; ++ case LoongArch::PseudoXVBZ_H: ++ CondOpc = 
LoongArch::XVSETANYEQZ_H; ++ break; ++ case LoongArch::PseudoXVBZ_W: ++ CondOpc = LoongArch::XVSETANYEQZ_W; ++ break; ++ case LoongArch::PseudoXVBZ_D: ++ CondOpc = LoongArch::XVSETANYEQZ_D; ++ break; ++ case LoongArch::PseudoXVBNZ: ++ CondOpc = LoongArch::XVSETNEZ_V; ++ break; ++ case LoongArch::PseudoXVBNZ_B: ++ CondOpc = LoongArch::XVSETALLNEZ_B; ++ break; ++ case LoongArch::PseudoXVBNZ_H: ++ CondOpc = LoongArch::XVSETALLNEZ_H; ++ break; ++ case LoongArch::PseudoXVBNZ_W: ++ CondOpc = LoongArch::XVSETALLNEZ_W; ++ break; ++ case LoongArch::PseudoXVBNZ_D: ++ CondOpc = LoongArch::XVSETALLNEZ_D; ++ break; + } + + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); +@@ -2636,6 +3010,16 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + case LoongArch::PseudoVBNZ_H: + case LoongArch::PseudoVBNZ_W: + case LoongArch::PseudoVBNZ_D: ++ case LoongArch::PseudoXVBZ: ++ case LoongArch::PseudoXVBZ_B: ++ case LoongArch::PseudoXVBZ_H: ++ case LoongArch::PseudoXVBZ_W: ++ case LoongArch::PseudoXVBZ_D: ++ case LoongArch::PseudoXVBNZ: ++ case LoongArch::PseudoXVBNZ_B: ++ case LoongArch::PseudoXVBNZ_H: ++ case LoongArch::PseudoXVBNZ_W: ++ case LoongArch::PseudoXVBNZ_D: + return emitVecCondBranchPseudo(MI, BB, Subtarget); + } + } +@@ -2746,6 +3130,10 @@ const MCPhysReg ArgVRs[] = {LoongArch::VR0, LoongArch::VR1, LoongArch::VR2, + LoongArch::VR3, LoongArch::VR4, LoongArch::VR5, + LoongArch::VR6, LoongArch::VR7}; + ++const MCPhysReg ArgXRs[] = {LoongArch::XR0, LoongArch::XR1, LoongArch::XR2, ++ LoongArch::XR3, LoongArch::XR4, LoongArch::XR5, ++ LoongArch::XR6, LoongArch::XR7}; ++ + // Pass a 2*GRLen argument that has been split into two GRLen values through + // registers or the stack as necessary. 
+ static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State, +@@ -2894,6 +3282,8 @@ static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, + Reg = State.AllocateReg(ArgFPR64s); + else if (ValVT.is128BitVector()) + Reg = State.AllocateReg(ArgVRs); ++ else if (ValVT.is256BitVector()) ++ Reg = State.AllocateReg(ArgXRs); + else + Reg = State.AllocateReg(ArgGPRs); + +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +index a5d66ebac96a..ddd1c9943fac 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +@@ -55,6 +55,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + return; + } + ++ // XR->XR copies. ++ if (LoongArch::LASX256RegClass.contains(DstReg, SrcReg)) { ++ BuildMI(MBB, MBBI, DL, get(LoongArch::XVORI_B), DstReg) ++ .addReg(SrcReg, getKillRegState(KillSrc)) ++ .addImm(0); ++ return; ++ } ++ + // GPR->CFR copy. + if (LoongArch::CFRRegClass.contains(DstReg) && + LoongArch::GPRRegClass.contains(SrcReg)) { +@@ -109,6 +117,8 @@ void LoongArchInstrInfo::storeRegToStackSlot( + Opcode = LoongArch::FST_D; + else if (LoongArch::LSX128RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::VST; ++ else if (LoongArch::LASX256RegClass.hasSubClassEq(RC)) ++ Opcode = LoongArch::XVST; + else if (LoongArch::CFRRegClass.hasSubClassEq(RC)) + Opcode = LoongArch::PseudoST_CFR; + else +@@ -145,6 +155,8 @@ void LoongArchInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + Opcode = LoongArch::FLD_D; + else if (LoongArch::LSX128RegClass.hasSubClassEq(RC)) + Opcode = LoongArch::VLD; ++ else if (LoongArch::LASX256RegClass.hasSubClassEq(RC)) ++ Opcode = LoongArch::XVLD; + else if (LoongArch::CFRRegClass.hasSubClassEq(RC)) + Opcode = LoongArch::PseudoLD_CFR; + else +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index dc37b37b2186..a3afd4789dfc 100644 
+--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -10,6 +10,30 @@ + // + //===----------------------------------------------------------------------===// + ++def lasxsplati8 ++ : PatFrag<(ops node:$e0), ++ (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0))>; ++def lasxsplati16 ++ : PatFrag<(ops node:$e0), ++ (v16i16 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0))>; ++def lasxsplati32 ++ : PatFrag<(ops node:$e0), ++ (v8i32 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0))>; ++def lasxsplati64 ++ : PatFrag<(ops node:$e0), ++ (v4i64 (build_vector node:$e0, node:$e0, node:$e0, node:$e0))>; ++ + //===----------------------------------------------------------------------===// + // Instruction class templates + //===----------------------------------------------------------------------===// +@@ -1029,4 +1053,682 @@ def PseudoXVREPLI_D : Pseudo<(outs LASX256:$xd), (ins simm10:$imm), [], + "xvrepli.d", "$xd, $imm">; + } + ++def PseudoXVBNZ_B : VecCond; ++def PseudoXVBNZ_H : VecCond; ++def PseudoXVBNZ_W : VecCond; ++def PseudoXVBNZ_D : VecCond; ++def PseudoXVBNZ : VecCond; ++ ++def PseudoXVBZ_B : VecCond; ++def PseudoXVBZ_H : VecCond; ++def PseudoXVBZ_W : VecCond; ++def PseudoXVBZ_D : VecCond; ++def PseudoXVBZ : VecCond; ++ ++} // Predicates = [HasExtLASX] ++ ++multiclass PatXr { ++ def : Pat<(v32i8 (OpNode (v32i8 LASX256:$xj))), ++ (!cast(Inst#"_B") LASX256:$xj)>; ++ def : Pat<(v16i16 (OpNode (v16i16 LASX256:$xj))), 
++ (!cast(Inst#"_H") LASX256:$xj)>; ++ def : Pat<(v8i32 (OpNode (v8i32 LASX256:$xj))), ++ (!cast(Inst#"_W") LASX256:$xj)>; ++ def : Pat<(v4i64 (OpNode (v4i64 LASX256:$xj))), ++ (!cast(Inst#"_D") LASX256:$xj)>; ++} ++ ++multiclass PatXrXr { ++ def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), ++ (!cast(Inst#"_B") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)), ++ (!cast(Inst#"_H") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)), ++ (!cast(Inst#"_W") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)), ++ (!cast(Inst#"_D") LASX256:$xj, LASX256:$xk)>; ++} ++ ++multiclass PatXrXrF { ++ def : Pat<(OpNode (v8f32 LASX256:$xj), (v8f32 LASX256:$xk)), ++ (!cast(Inst#"_S") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v4f64 LASX256:$xj), (v4f64 LASX256:$xk)), ++ (!cast(Inst#"_D") LASX256:$xj, LASX256:$xk)>; ++} ++ ++multiclass PatXrXrU { ++ def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), ++ (!cast(Inst#"_BU") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)), ++ (!cast(Inst#"_HU") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)), ++ (!cast(Inst#"_WU") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)), ++ (!cast(Inst#"_DU") LASX256:$xj, LASX256:$xk)>; ++} ++ ++multiclass PatXrSimm5 { ++ def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_B") LASX256:$xj, simm5:$imm)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_H") LASX256:$xj, simm5:$imm)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_W") LASX256:$xj, simm5:$imm)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 (SplatPat_simm5 simm5:$imm))), ++ (!cast(Inst#"_D") LASX256:$xj, simm5:$imm)>; ++} ++ 
++multiclass PatXrUimm5 { ++ def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_BU") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xj), (v16i16 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_HU") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_WU") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_DU") LASX256:$xj, uimm5:$imm)>; ++} ++ ++multiclass PatXrXrXr { ++ def : Pat<(OpNode (v32i8 LASX256:$xd), (v32i8 LASX256:$xj), ++ (v32i8 LASX256:$xk)), ++ (!cast(Inst#"_B") LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xd), (v16i16 LASX256:$xj), ++ (v16i16 LASX256:$xk)), ++ (!cast(Inst#"_H") LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xd), (v8i32 LASX256:$xj), ++ (v8i32 LASX256:$xk)), ++ (!cast(Inst#"_W") LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xd), (v4i64 LASX256:$xj), ++ (v4i64 LASX256:$xk)), ++ (!cast(Inst#"_D") LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++} ++ ++multiclass PatShiftXrXr { ++ def : Pat<(OpNode (v32i8 LASX256:$xj), (and vsplati8_imm_eq_7, ++ (v32i8 LASX256:$xk))), ++ (!cast(Inst#"_B") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xj), (and vsplati16_imm_eq_15, ++ (v16i16 LASX256:$xk))), ++ (!cast(Inst#"_H") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xj), (and vsplati32_imm_eq_31, ++ (v8i32 LASX256:$xk))), ++ (!cast(Inst#"_W") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xj), (and vsplati64_imm_eq_63, ++ (v4i64 LASX256:$xk))), ++ (!cast(Inst#"_D") LASX256:$xj, LASX256:$xk)>; ++} ++ ++multiclass PatShiftXrUimm { ++ def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 (SplatPat_uimm3 uimm3:$imm))), ++ (!cast(Inst#"_B") LASX256:$xj, uimm3:$imm)>; ++ def : Pat<(OpNode (v16i16 LASX256:$xj), 
(v16i16 (SplatPat_uimm4 uimm4:$imm))), ++ (!cast(Inst#"_H") LASX256:$xj, uimm4:$imm)>; ++ def : Pat<(OpNode (v8i32 LASX256:$xj), (v8i32 (SplatPat_uimm5 uimm5:$imm))), ++ (!cast(Inst#"_W") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(OpNode (v4i64 LASX256:$xj), (v4i64 (SplatPat_uimm6 uimm6:$imm))), ++ (!cast(Inst#"_D") LASX256:$xj, uimm6:$imm)>; ++} ++ ++class PatXrXrB ++ : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), ++ (Inst LASX256:$xj, LASX256:$xk)>; ++ ++let Predicates = [HasExtLASX] in { ++ ++// XVADD_{B/H/W/D} ++defm : PatXrXr; ++// XVSUB_{B/H/W/D} ++defm : PatXrXr; ++ ++// XVADDI_{B/H/W/D}U ++defm : PatXrUimm5; ++// XVSUBI_{B/H/W/D}U ++defm : PatXrUimm5; ++ ++// XVNEG_{B/H/W/D} ++def : Pat<(sub immAllZerosV, (v32i8 LASX256:$xj)), (XVNEG_B LASX256:$xj)>; ++def : Pat<(sub immAllZerosV, (v16i16 LASX256:$xj)), (XVNEG_H LASX256:$xj)>; ++def : Pat<(sub immAllZerosV, (v8i32 LASX256:$xj)), (XVNEG_W LASX256:$xj)>; ++def : Pat<(sub immAllZerosV, (v4i64 LASX256:$xj)), (XVNEG_D LASX256:$xj)>; ++ ++// XVMAX[I]_{B/H/W/D}[U] ++defm : PatXrXr; ++defm : PatXrXrU; ++defm : PatXrSimm5; ++defm : PatXrUimm5; ++ ++// XVMIN[I]_{B/H/W/D}[U] ++defm : PatXrXr; ++defm : PatXrXrU; ++defm : PatXrSimm5; ++defm : PatXrUimm5; ++ ++// XVMUL_{B/H/W/D} ++defm : PatXrXr; ++ ++// XVMADD_{B/H/W/D} ++defm : PatXrXrXr; ++// XVMSUB_{B/H/W/D} ++defm : PatXrXrXr; ++ ++// XVDIV_{B/H/W/D}[U] ++defm : PatXrXr; ++defm : PatXrXrU; ++ ++// XVMOD_{B/H/W/D}[U] ++defm : PatXrXr; ++defm : PatXrXrU; ++ ++// XVAND_V ++def : PatXrXrB; ++// XVNOR_V ++def : PatXrXrB; ++// XVXOR_V ++def : PatXrXrB; ++// XVNOR_V ++def : Pat<(vnot (or (v32i8 LASX256:$xj), (v32i8 LASX256:$xk))), ++ (XVNOR_V LASX256:$xj, LASX256:$xk)>; ++ ++// XVANDI_B ++def : Pat<(and (v32i8 LASX256:$xj), (v32i8 (SplatPat_uimm8 uimm8:$imm))), ++ (XVANDI_B LASX256:$xj, uimm8:$imm)>; ++// XVORI_B ++def : Pat<(or (v32i8 LASX256:$xj), (v32i8 (SplatPat_uimm8 uimm8:$imm))), ++ (XVORI_B LASX256:$xj, uimm8:$imm)>; ++ ++// XVXORI_B ++def : Pat<(xor (v32i8 
LASX256:$xj), (v32i8 (SplatPat_uimm8 uimm8:$imm))), ++ (XVXORI_B LASX256:$xj, uimm8:$imm)>; ++ ++// XVSLL[I]_{B/H/W/D} ++defm : PatXrXr; ++defm : PatShiftXrXr; ++defm : PatShiftXrUimm; ++ ++// XVSRL[I]_{B/H/W/D} ++defm : PatXrXr; ++defm : PatShiftXrXr; ++defm : PatShiftXrUimm; ++ ++// XVSRA[I]_{B/H/W/D} ++defm : PatXrXr; ++defm : PatShiftXrXr; ++defm : PatShiftXrUimm; ++ ++// XVPCNT_{B/H/W/D} ++defm : PatXr; ++ ++// XVBITCLR_{B/H/W/D} ++def : Pat<(and v32i8:$xj, (vnot (shl vsplat_imm_eq_1, v32i8:$xk))), ++ (v32i8 (XVBITCLR_B v32i8:$xj, v32i8:$xk))>; ++def : Pat<(and v16i16:$xj, (vnot (shl vsplat_imm_eq_1, v16i16:$xk))), ++ (v16i16 (XVBITCLR_H v16i16:$xj, v16i16:$xk))>; ++def : Pat<(and v8i32:$xj, (vnot (shl vsplat_imm_eq_1, v8i32:$xk))), ++ (v8i32 (XVBITCLR_W v8i32:$xj, v8i32:$xk))>; ++def : Pat<(and v4i64:$xj, (vnot (shl vsplat_imm_eq_1, v4i64:$xk))), ++ (v4i64 (XVBITCLR_D v4i64:$xj, v4i64:$xk))>; ++def : Pat<(and v32i8:$xj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati8imm7 v32i8:$xk)))), ++ (v32i8 (XVBITCLR_B v32i8:$xj, v32i8:$xk))>; ++def : Pat<(and v16i16:$xj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati16imm15 v16i16:$xk)))), ++ (v16i16 (XVBITCLR_H v16i16:$xj, v16i16:$xk))>; ++def : Pat<(and v8i32:$xj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati32imm31 v8i32:$xk)))), ++ (v8i32 (XVBITCLR_W v8i32:$xj, v8i32:$xk))>; ++def : Pat<(and v4i64:$xj, (vnot (shl vsplat_imm_eq_1, ++ (vsplati64imm63 v4i64:$xk)))), ++ (v4i64 (XVBITCLR_D v4i64:$xj, v4i64:$xk))>; ++ ++// XVBITCLRI_{B/H/W/D} ++def : Pat<(and (v32i8 LASX256:$xj), (v32i8 (vsplat_uimm_inv_pow2 uimm3:$imm))), ++ (XVBITCLRI_B LASX256:$xj, uimm3:$imm)>; ++def : Pat<(and (v16i16 LASX256:$xj), (v16i16 (vsplat_uimm_inv_pow2 uimm4:$imm))), ++ (XVBITCLRI_H LASX256:$xj, uimm4:$imm)>; ++def : Pat<(and (v8i32 LASX256:$xj), (v8i32 (vsplat_uimm_inv_pow2 uimm5:$imm))), ++ (XVBITCLRI_W LASX256:$xj, uimm5:$imm)>; ++def : Pat<(and (v4i64 LASX256:$xj), (v4i64 (vsplat_uimm_inv_pow2 uimm6:$imm))), ++ (XVBITCLRI_D LASX256:$xj, uimm6:$imm)>; ++ 
++// XVBITSET_{B/H/W/D} ++def : Pat<(or v32i8:$xj, (shl vsplat_imm_eq_1, v32i8:$xk)), ++ (v32i8 (XVBITSET_B v32i8:$xj, v32i8:$xk))>; ++def : Pat<(or v16i16:$xj, (shl vsplat_imm_eq_1, v16i16:$xk)), ++ (v16i16 (XVBITSET_H v16i16:$xj, v16i16:$xk))>; ++def : Pat<(or v8i32:$xj, (shl vsplat_imm_eq_1, v8i32:$xk)), ++ (v8i32 (XVBITSET_W v8i32:$xj, v8i32:$xk))>; ++def : Pat<(or v4i64:$xj, (shl vsplat_imm_eq_1, v4i64:$xk)), ++ (v4i64 (XVBITSET_D v4i64:$xj, v4i64:$xk))>; ++def : Pat<(or v32i8:$xj, (shl vsplat_imm_eq_1, (vsplati8imm7 v32i8:$xk))), ++ (v32i8 (XVBITSET_B v32i8:$xj, v32i8:$xk))>; ++def : Pat<(or v16i16:$xj, (shl vsplat_imm_eq_1, (vsplati16imm15 v16i16:$xk))), ++ (v16i16 (XVBITSET_H v16i16:$xj, v16i16:$xk))>; ++def : Pat<(or v8i32:$xj, (shl vsplat_imm_eq_1, (vsplati32imm31 v8i32:$xk))), ++ (v8i32 (XVBITSET_W v8i32:$xj, v8i32:$xk))>; ++def : Pat<(or v4i64:$xj, (shl vsplat_imm_eq_1, (vsplati64imm63 v4i64:$xk))), ++ (v4i64 (XVBITSET_D v4i64:$xj, v4i64:$xk))>; ++ ++// XVBITSETI_{B/H/W/D} ++def : Pat<(or (v32i8 LASX256:$xj), (v32i8 (vsplat_uimm_pow2 uimm3:$imm))), ++ (XVBITSETI_B LASX256:$xj, uimm3:$imm)>; ++def : Pat<(or (v16i16 LASX256:$xj), (v16i16 (vsplat_uimm_pow2 uimm4:$imm))), ++ (XVBITSETI_H LASX256:$xj, uimm4:$imm)>; ++def : Pat<(or (v8i32 LASX256:$xj), (v8i32 (vsplat_uimm_pow2 uimm5:$imm))), ++ (XVBITSETI_W LASX256:$xj, uimm5:$imm)>; ++def : Pat<(or (v4i64 LASX256:$xj), (v4i64 (vsplat_uimm_pow2 uimm6:$imm))), ++ (XVBITSETI_D LASX256:$xj, uimm6:$imm)>; ++ ++// XVBITREV_{B/H/W/D} ++def : Pat<(xor v32i8:$xj, (shl vsplat_imm_eq_1, v32i8:$xk)), ++ (v32i8 (XVBITREV_B v32i8:$xj, v32i8:$xk))>; ++def : Pat<(xor v16i16:$xj, (shl vsplat_imm_eq_1, v16i16:$xk)), ++ (v16i16 (XVBITREV_H v16i16:$xj, v16i16:$xk))>; ++def : Pat<(xor v8i32:$xj, (shl vsplat_imm_eq_1, v8i32:$xk)), ++ (v8i32 (XVBITREV_W v8i32:$xj, v8i32:$xk))>; ++def : Pat<(xor v4i64:$xj, (shl vsplat_imm_eq_1, v4i64:$xk)), ++ (v4i64 (XVBITREV_D v4i64:$xj, v4i64:$xk))>; ++def : Pat<(xor v32i8:$xj, (shl 
vsplat_imm_eq_1, (vsplati8imm7 v32i8:$xk))), ++ (v32i8 (XVBITREV_B v32i8:$xj, v32i8:$xk))>; ++def : Pat<(xor v16i16:$xj, (shl vsplat_imm_eq_1, (vsplati16imm15 v16i16:$xk))), ++ (v16i16 (XVBITREV_H v16i16:$xj, v16i16:$xk))>; ++def : Pat<(xor v8i32:$xj, (shl vsplat_imm_eq_1, (vsplati32imm31 v8i32:$xk))), ++ (v8i32 (XVBITREV_W v8i32:$xj, v8i32:$xk))>; ++def : Pat<(xor v4i64:$xj, (shl vsplat_imm_eq_1, (vsplati64imm63 v4i64:$xk))), ++ (v4i64 (XVBITREV_D v4i64:$xj, v4i64:$xk))>; ++ ++// XVBITREVI_{B/H/W/D} ++def : Pat<(xor (v32i8 LASX256:$xj), (v32i8 (vsplat_uimm_pow2 uimm3:$imm))), ++ (XVBITREVI_B LASX256:$xj, uimm3:$imm)>; ++def : Pat<(xor (v16i16 LASX256:$xj), (v16i16 (vsplat_uimm_pow2 uimm4:$imm))), ++ (XVBITREVI_H LASX256:$xj, uimm4:$imm)>; ++def : Pat<(xor (v8i32 LASX256:$xj), (v8i32 (vsplat_uimm_pow2 uimm5:$imm))), ++ (XVBITREVI_W LASX256:$xj, uimm5:$imm)>; ++def : Pat<(xor (v4i64 LASX256:$xj), (v4i64 (vsplat_uimm_pow2 uimm6:$imm))), ++ (XVBITREVI_D LASX256:$xj, uimm6:$imm)>; ++ ++// XVFADD_{S/D} ++defm : PatXrXrF; ++ ++// XVFSUB_{S/D} ++defm : PatXrXrF; ++ ++// XVFMUL_{S/D} ++defm : PatXrXrF; ++ ++// XVFDIV_{S/D} ++defm : PatXrXrF; ++ ++// XVFMADD_{S/D} ++def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), ++ (XVFMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; ++def : Pat<(fma v4f64:$xj, v4f64:$xk, v4f64:$xa), ++ (XVFMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; ++ ++// XVINSGR2VR_{W/D} ++def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), ++ (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; ++def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), ++ (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; ++ ++// XVPICKVE2GR_W[U] ++def : Pat<(loongarch_vpick_sext_elt v8i32:$xd, uimm3:$imm, i32), ++ (XVPICKVE2GR_W v8i32:$xd, uimm3:$imm)>; ++def : Pat<(loongarch_vpick_zext_elt v8i32:$xd, uimm3:$imm, i32), ++ (XVPICKVE2GR_WU v8i32:$xd, uimm3:$imm)>; ++ ++// XVREPLGR2VR_{B/H/W/D} ++def : Pat<(lasxsplati8 GPR:$rj), (XVREPLGR2VR_B GPR:$rj)>; ++def : Pat<(lasxsplati16 
GPR:$rj), (XVREPLGR2VR_H GPR:$rj)>; ++def : Pat<(lasxsplati32 GPR:$rj), (XVREPLGR2VR_W GPR:$rj)>; ++def : Pat<(lasxsplati64 GPR:$rj), (XVREPLGR2VR_D GPR:$rj)>; ++ ++// XVREPLVE_{B/H/W/D} ++def : Pat<(loongarch_vreplve v32i8:$xj, GRLenVT:$rk), ++ (XVREPLVE_B v32i8:$xj, GRLenVT:$rk)>; ++def : Pat<(loongarch_vreplve v16i16:$xj, GRLenVT:$rk), ++ (XVREPLVE_H v16i16:$xj, GRLenVT:$rk)>; ++def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk), ++ (XVREPLVE_W v8i32:$xj, GRLenVT:$rk)>; ++def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk), ++ (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>; ++ ++// Loads/Stores ++foreach vt = [v32i8, v16i16, v8i32, v4i64] in { ++ defm : LdPat; ++ def : RegRegLdPat; ++ defm : StPat; ++ def : RegRegStPat; ++} ++ ++} // Predicates = [HasExtLASX] ++ ++/// Intrinsic pattern ++ ++class deriveLASXIntrinsic { ++ Intrinsic ret = !cast(!tolower("int_loongarch_lasx_"#Inst)); ++} ++ ++let Predicates = [HasExtLASX] in { ++ ++// vty: v32i8/v16i16/v8i32/v4i64 ++// Pat<(Intrinsic vty:$xj, vty:$xk), ++// (LAInst vty:$xj, vty:$xk)>; ++foreach Inst = ["XVSADD_B", "XVSADD_BU", "XVSSUB_B", "XVSSUB_BU", ++ "XVHADDW_H_B", "XVHADDW_HU_BU", "XVHSUBW_H_B", "XVHSUBW_HU_BU", ++ "XVADDWEV_H_B", "XVADDWOD_H_B", "XVSUBWEV_H_B", "XVSUBWOD_H_B", ++ "XVADDWEV_H_BU", "XVADDWOD_H_BU", "XVSUBWEV_H_BU", "XVSUBWOD_H_BU", ++ "XVADDWEV_H_BU_B", "XVADDWOD_H_BU_B", ++ "XVAVG_B", "XVAVG_BU", "XVAVGR_B", "XVAVGR_BU", ++ "XVABSD_B", "XVABSD_BU", "XVADDA_B", "XVMUH_B", "XVMUH_BU", ++ "XVMULWEV_H_B", "XVMULWOD_H_B", "XVMULWEV_H_BU", "XVMULWOD_H_BU", ++ "XVMULWEV_H_BU_B", "XVMULWOD_H_BU_B", "XVSIGNCOV_B", ++ "XVANDN_V", "XVORN_V", "XVROTR_B", "XVSRLR_B", "XVSRAR_B", ++ "XVSEQ_B", "XVSLE_B", "XVSLE_BU", "XVSLT_B", "XVSLT_BU", ++ "XVPACKEV_B", "XVPACKOD_B", "XVPICKEV_B", "XVPICKOD_B", ++ "XVILVL_B", "XVILVH_B"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVSADD_H", "XVSADD_HU", "XVSSUB_H", 
"XVSSUB_HU", ++ "XVHADDW_W_H", "XVHADDW_WU_HU", "XVHSUBW_W_H", "XVHSUBW_WU_HU", ++ "XVADDWEV_W_H", "XVADDWOD_W_H", "XVSUBWEV_W_H", "XVSUBWOD_W_H", ++ "XVADDWEV_W_HU", "XVADDWOD_W_HU", "XVSUBWEV_W_HU", "XVSUBWOD_W_HU", ++ "XVADDWEV_W_HU_H", "XVADDWOD_W_HU_H", ++ "XVAVG_H", "XVAVG_HU", "XVAVGR_H", "XVAVGR_HU", ++ "XVABSD_H", "XVABSD_HU", "XVADDA_H", "XVMUH_H", "XVMUH_HU", ++ "XVMULWEV_W_H", "XVMULWOD_W_H", "XVMULWEV_W_HU", "XVMULWOD_W_HU", ++ "XVMULWEV_W_HU_H", "XVMULWOD_W_HU_H", "XVSIGNCOV_H", "XVROTR_H", ++ "XVSRLR_H", "XVSRAR_H", "XVSRLN_B_H", "XVSRAN_B_H", "XVSRLRN_B_H", ++ "XVSRARN_B_H", "XVSSRLN_B_H", "XVSSRAN_B_H", "XVSSRLN_BU_H", ++ "XVSSRAN_BU_H", "XVSSRLRN_B_H", "XVSSRARN_B_H", "XVSSRLRN_BU_H", ++ "XVSSRARN_BU_H", ++ "XVSEQ_H", "XVSLE_H", "XVSLE_HU", "XVSLT_H", "XVSLT_HU", ++ "XVPACKEV_H", "XVPACKOD_H", "XVPICKEV_H", "XVPICKOD_H", ++ "XVILVL_H", "XVILVH_H"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVSADD_W", "XVSADD_WU", "XVSSUB_W", "XVSSUB_WU", ++ "XVHADDW_D_W", "XVHADDW_DU_WU", "XVHSUBW_D_W", "XVHSUBW_DU_WU", ++ "XVADDWEV_D_W", "XVADDWOD_D_W", "XVSUBWEV_D_W", "XVSUBWOD_D_W", ++ "XVADDWEV_D_WU", "XVADDWOD_D_WU", "XVSUBWEV_D_WU", "XVSUBWOD_D_WU", ++ "XVADDWEV_D_WU_W", "XVADDWOD_D_WU_W", ++ "XVAVG_W", "XVAVG_WU", "XVAVGR_W", "XVAVGR_WU", ++ "XVABSD_W", "XVABSD_WU", "XVADDA_W", "XVMUH_W", "XVMUH_WU", ++ "XVMULWEV_D_W", "XVMULWOD_D_W", "XVMULWEV_D_WU", "XVMULWOD_D_WU", ++ "XVMULWEV_D_WU_W", "XVMULWOD_D_WU_W", "XVSIGNCOV_W", "XVROTR_W", ++ "XVSRLR_W", "XVSRAR_W", "XVSRLN_H_W", "XVSRAN_H_W", "XVSRLRN_H_W", ++ "XVSRARN_H_W", "XVSSRLN_H_W", "XVSSRAN_H_W", "XVSSRLN_HU_W", ++ "XVSSRAN_HU_W", "XVSSRLRN_H_W", "XVSSRARN_H_W", "XVSSRLRN_HU_W", ++ "XVSSRARN_HU_W", ++ "XVSEQ_W", "XVSLE_W", "XVSLE_WU", "XVSLT_W", "XVSLT_WU", ++ "XVPACKEV_W", "XVPACKOD_W", "XVPICKEV_W", "XVPICKOD_W", ++ "XVILVL_W", "XVILVH_W", "XVPERM_W"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ 
(v8i32 LASX256:$xj), (v8i32 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVADD_Q", "XVSUB_Q", ++ "XVSADD_D", "XVSADD_DU", "XVSSUB_D", "XVSSUB_DU", ++ "XVHADDW_Q_D", "XVHADDW_QU_DU", "XVHSUBW_Q_D", "XVHSUBW_QU_DU", ++ "XVADDWEV_Q_D", "XVADDWOD_Q_D", "XVSUBWEV_Q_D", "XVSUBWOD_Q_D", ++ "XVADDWEV_Q_DU", "XVADDWOD_Q_DU", "XVSUBWEV_Q_DU", "XVSUBWOD_Q_DU", ++ "XVADDWEV_Q_DU_D", "XVADDWOD_Q_DU_D", ++ "XVAVG_D", "XVAVG_DU", "XVAVGR_D", "XVAVGR_DU", ++ "XVABSD_D", "XVABSD_DU", "XVADDA_D", "XVMUH_D", "XVMUH_DU", ++ "XVMULWEV_Q_D", "XVMULWOD_Q_D", "XVMULWEV_Q_DU", "XVMULWOD_Q_DU", ++ "XVMULWEV_Q_DU_D", "XVMULWOD_Q_DU_D", "XVSIGNCOV_D", "XVROTR_D", ++ "XVSRLR_D", "XVSRAR_D", "XVSRLN_W_D", "XVSRAN_W_D", "XVSRLRN_W_D", ++ "XVSRARN_W_D", "XVSSRLN_W_D", "XVSSRAN_W_D", "XVSSRLN_WU_D", ++ "XVSSRAN_WU_D", "XVSSRLRN_W_D", "XVSSRARN_W_D", "XVSSRLRN_WU_D", ++ "XVSSRARN_WU_D", "XVFFINT_S_L", ++ "XVSEQ_D", "XVSLE_D", "XVSLE_DU", "XVSLT_D", "XVSLT_DU", ++ "XVPACKEV_D", "XVPACKOD_D", "XVPICKEV_D", "XVPICKOD_D", ++ "XVILVL_D", "XVILVH_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk)>; ++ ++// vty: v32i8/v16i16/v8i32/v4i64 ++// Pat<(Intrinsic vty:$xd, vty:$xj, vty:$xk), ++// (LAInst vty:$xd, vty:$xj, vty:$xk)>; ++foreach Inst = ["XVMADDWEV_H_B", "XVMADDWOD_H_B", "XVMADDWEV_H_BU", ++ "XVMADDWOD_H_BU", "XVMADDWEV_H_BU_B", "XVMADDWOD_H_BU_B"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v16i16 LASX256:$xd), (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVMADDWEV_W_H", "XVMADDWOD_W_H", "XVMADDWEV_W_HU", ++ "XVMADDWOD_W_HU", "XVMADDWEV_W_HU_H", "XVMADDWOD_W_HU_H"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v8i32 LASX256:$xd), (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVMADDWEV_D_W", "XVMADDWOD_D_W", "XVMADDWEV_D_WU", ++ 
"XVMADDWOD_D_WU", "XVMADDWEV_D_WU_W", "XVMADDWOD_D_WU_W"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v4i64 LASX256:$xd), (v8i32 LASX256:$xj), (v8i32 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVMADDWEV_Q_D", "XVMADDWOD_Q_D", "XVMADDWEV_Q_DU", ++ "XVMADDWOD_Q_DU", "XVMADDWEV_Q_DU_D", "XVMADDWOD_Q_DU_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v4i64 LASX256:$xd), (v4i64 LASX256:$xj), (v4i64 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++ ++// vty: v32i8/v16i16/v8i32/v4i64 ++// Pat<(Intrinsic vty:$xj), ++// (LAInst vty:$xj)>; ++foreach Inst = ["XVEXTH_H_B", "XVEXTH_HU_BU", ++ "XVMSKLTZ_B", "XVMSKGEZ_B", "XVMSKNZ_B", ++ "XVCLO_B", "XVCLZ_B", "VEXT2XV_H_B", "VEXT2XV_HU_BU", ++ "VEXT2XV_W_B", "VEXT2XV_WU_BU", "VEXT2XV_D_B", ++ "VEXT2XV_DU_BU", "XVREPLVE0_B", "XVREPLVE0_Q"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v32i8 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++foreach Inst = ["XVEXTH_W_H", "XVEXTH_WU_HU", "XVMSKLTZ_H", ++ "XVCLO_H", "XVCLZ_H", "XVFCVTL_S_H", "XVFCVTH_S_H", ++ "VEXT2XV_W_H", "VEXT2XV_WU_HU", "VEXT2XV_D_H", ++ "VEXT2XV_DU_HU", "XVREPLVE0_H"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v16i16 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++foreach Inst = ["XVEXTH_D_W", "XVEXTH_DU_WU", "XVMSKLTZ_W", ++ "XVCLO_W", "XVCLZ_W", "XVFFINT_S_W", "XVFFINT_S_WU", ++ "XVFFINTL_D_W", "XVFFINTH_D_W", ++ "VEXT2XV_D_W", "VEXT2XV_DU_WU", "XVREPLVE0_W"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v8i32 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++foreach Inst = ["XVEXTH_Q_D", "XVEXTH_QU_DU", "XVMSKLTZ_D", ++ "XVEXTL_Q_D", "XVEXTL_QU_DU", ++ "XVCLO_D", "XVCLZ_D", "XVFFINT_D_L", "XVFFINT_D_LU", ++ "XVREPLVE0_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v4i64 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++ ++// Pat<(Intrinsic timm:$imm) ++// (LAInst timm:$imm)>; ++def : Pat<(int_loongarch_lasx_xvldi timm:$imm), ++ (XVLDI (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVREPLI_B", 
"XVREPLI_H", "XVREPLI_W", "XVREPLI_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret timm:$imm), ++ (!cast("Pseudo"#Inst) (to_valide_timm timm:$imm))>; ++ ++// vty: v32i8/v16i16/v8i32/v4i64 ++// Pat<(Intrinsic vty:$xj, timm:$imm) ++// (LAInst vty:$xj, timm:$imm)>; ++foreach Inst = ["XVSAT_B", "XVSAT_BU", "XVNORI_B", "XVROTRI_B", "XVSLLWIL_H_B", ++ "XVSLLWIL_HU_BU", "XVSRLRI_B", "XVSRARI_B", ++ "XVSEQI_B", "XVSLEI_B", "XVSLEI_BU", "XVSLTI_B", "XVSLTI_BU", ++ "XVREPL128VEI_B", "XVBSLL_V", "XVBSRL_V", "XVSHUF4I_B"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v32i8 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVSAT_H", "XVSAT_HU", "XVROTRI_H", "XVSLLWIL_W_H", ++ "XVSLLWIL_WU_HU", "XVSRLRI_H", "XVSRARI_H", ++ "XVSEQI_H", "XVSLEI_H", "XVSLEI_HU", "XVSLTI_H", "XVSLTI_HU", ++ "XVREPL128VEI_H", "XVSHUF4I_H"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v16i16 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVSAT_W", "XVSAT_WU", "XVROTRI_W", "XVSLLWIL_D_W", ++ "XVSLLWIL_DU_WU", "XVSRLRI_W", "XVSRARI_W", ++ "XVSEQI_W", "XVSLEI_W", "XVSLEI_WU", "XVSLTI_W", "XVSLTI_WU", ++ "XVREPL128VEI_W", "XVSHUF4I_W", "XVPICKVE_W"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v8i32 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVSAT_D", "XVSAT_DU", "XVROTRI_D", "XVSRLRI_D", "XVSRARI_D", ++ "XVSEQI_D", "XVSLEI_D", "XVSLEI_DU", "XVSLTI_D", "XVSLTI_DU", ++ "XVPICKVE2GR_D", "XVPICKVE2GR_DU", ++ "XVREPL128VEI_D", "XVPERMI_D", "XVPICKVE_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v4i64 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++ ++// vty: v32i8/v16i16/v8i32/v4i64 ++// Pat<(Intrinsic vty:$xd, vty:$xj, timm:$imm) ++// (LAInst vty:$xd, vty:$xj, timm:$imm)>; ++foreach Inst = ["XVSRLNI_B_H", "XVSRANI_B_H", "XVSRLRNI_B_H", "XVSRARNI_B_H", ++ "XVSSRLNI_B_H", "XVSSRANI_B_H", "XVSSRLNI_BU_H", 
"XVSSRANI_BU_H", ++ "XVSSRLRNI_B_H", "XVSSRARNI_B_H", "XVSSRLRNI_BU_H", "XVSSRARNI_BU_H", ++ "XVFRSTPI_B", "XVBITSELI_B", "XVEXTRINS_B", "XVPERMI_Q"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v32i8 LASX256:$xd), (v32i8 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, ++ (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVSRLNI_H_W", "XVSRANI_H_W", "XVSRLRNI_H_W", "XVSRARNI_H_W", ++ "XVSSRLNI_H_W", "XVSSRANI_H_W", "XVSSRLNI_HU_W", "XVSSRANI_HU_W", ++ "XVSSRLRNI_H_W", "XVSSRARNI_H_W", "XVSSRLRNI_HU_W", "XVSSRARNI_HU_W", ++ "XVFRSTPI_H", "XVEXTRINS_H"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v16i16 LASX256:$xd), (v16i16 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, ++ (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVSRLNI_W_D", "XVSRANI_W_D", "XVSRLRNI_W_D", "XVSRARNI_W_D", ++ "XVSSRLNI_W_D", "XVSSRANI_W_D", "XVSSRLNI_WU_D", "XVSSRANI_WU_D", ++ "XVSSRLRNI_W_D", "XVSSRARNI_W_D", "XVSSRLRNI_WU_D", "XVSSRARNI_WU_D", ++ "XVPERMI_W", "XVEXTRINS_W", "XVINSVE0_W"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v8i32 LASX256:$xd), (v8i32 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, ++ (to_valide_timm timm:$imm))>; ++foreach Inst = ["XVSRLNI_D_Q", "XVSRANI_D_Q", "XVSRLRNI_D_Q", "XVSRARNI_D_Q", ++ "XVSSRLNI_D_Q", "XVSSRANI_D_Q", "XVSSRLNI_DU_Q", "XVSSRANI_DU_Q", ++ "XVSSRLRNI_D_Q", "XVSSRARNI_D_Q", "XVSSRLRNI_DU_Q", "XVSSRARNI_DU_Q", ++ "XVSHUF4I_D", "XVEXTRINS_D", "XVINSVE0_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v4i64 LASX256:$xd), (v4i64 LASX256:$xj), timm:$imm), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, ++ (to_valide_timm timm:$imm))>; ++ ++// vty: v32i8/v16i16/v8i32/v4i64 ++// Pat<(Intrinsic vty:$xd, vty:$xj, vty:$xk), ++// (LAInst vty:$xd, vty:$xj, vty:$xk)>; ++foreach Inst = ["XVFRSTP_B", "XVBITSEL_V", "XVSHUF_B"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v32i8 LASX256:$xd), (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++foreach Inst = 
["XVFRSTP_H", "XVSHUF_H"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v16i16 LASX256:$xd), (v16i16 LASX256:$xj), (v16i16 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++def : Pat<(int_loongarch_lasx_xvshuf_w (v8i32 LASX256:$xd), (v8i32 LASX256:$xj), ++ (v8i32 LASX256:$xk)), ++ (XVSHUF_W LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++def : Pat<(int_loongarch_lasx_xvshuf_d (v4i64 LASX256:$xd), (v4i64 LASX256:$xj), ++ (v4i64 LASX256:$xk)), ++ (XVSHUF_D LASX256:$xd, LASX256:$xj, LASX256:$xk)>; ++ ++// vty: v8f32/v4f64 ++// Pat<(Intrinsic vty:$xj, vty:$xk, vty:$xa), ++// (LAInst vty:$xj, vty:$xk, vty:$xa)>; ++foreach Inst = ["XVFMSUB_S", "XVFNMADD_S", "XVFNMSUB_S"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v8f32 LASX256:$xj), (v8f32 LASX256:$xk), (v8f32 LASX256:$xa)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk, LASX256:$xa)>; ++foreach Inst = ["XVFMSUB_D", "XVFNMADD_D", "XVFNMSUB_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v4f64 LASX256:$xj), (v4f64 LASX256:$xk), (v4f64 LASX256:$xa)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk, LASX256:$xa)>; ++ ++// vty: v8f32/v4f64 ++// Pat<(Intrinsic vty:$xj, vty:$xk), ++// (LAInst vty:$xj, vty:$xk)>; ++foreach Inst = ["XVFMAX_S", "XVFMIN_S", "XVFMAXA_S", "XVFMINA_S", "XVFCVT_H_S", ++ "XVFCMP_CAF_S", "XVFCMP_CUN_S", "XVFCMP_CEQ_S", "XVFCMP_CUEQ_S", ++ "XVFCMP_CLT_S", "XVFCMP_CULT_S", "XVFCMP_CLE_S", "XVFCMP_CULE_S", ++ "XVFCMP_CNE_S", "XVFCMP_COR_S", "XVFCMP_CUNE_S", ++ "XVFCMP_SAF_S", "XVFCMP_SUN_S", "XVFCMP_SEQ_S", "XVFCMP_SUEQ_S", ++ "XVFCMP_SLT_S", "XVFCMP_SULT_S", "XVFCMP_SLE_S", "XVFCMP_SULE_S", ++ "XVFCMP_SNE_S", "XVFCMP_SOR_S", "XVFCMP_SUNE_S"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v8f32 LASX256:$xj), (v8f32 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk)>; ++foreach Inst = ["XVFMAX_D", "XVFMIN_D", "XVFMAXA_D", "XVFMINA_D", "XVFCVT_S_D", ++ "XVFTINTRNE_W_D", "XVFTINTRZ_W_D", "XVFTINTRP_W_D", "XVFTINTRM_W_D", ++ "XVFTINT_W_D", ++ "XVFCMP_CAF_D", "XVFCMP_CUN_D", "XVFCMP_CEQ_D", 
"XVFCMP_CUEQ_D", ++ "XVFCMP_CLT_D", "XVFCMP_CULT_D", "XVFCMP_CLE_D", "XVFCMP_CULE_D", ++ "XVFCMP_CNE_D", "XVFCMP_COR_D", "XVFCMP_CUNE_D", ++ "XVFCMP_SAF_D", "XVFCMP_SUN_D", "XVFCMP_SEQ_D", "XVFCMP_SUEQ_D", ++ "XVFCMP_SLT_D", "XVFCMP_SULT_D", "XVFCMP_SLE_D", "XVFCMP_SULE_D", ++ "XVFCMP_SNE_D", "XVFCMP_SOR_D", "XVFCMP_SUNE_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret ++ (v4f64 LASX256:$xj), (v4f64 LASX256:$xk)), ++ (!cast(Inst) LASX256:$xj, LASX256:$xk)>; ++ ++// vty: v8f32/v4f64 ++// Pat<(Intrinsic vty:$xj), ++// (LAInst vty:$xj)>; ++foreach Inst = ["XVFLOGB_S", "XVFCLASS_S", "XVFSQRT_S", "XVFRECIP_S", "XVFRSQRT_S", ++ "XVFRINT_S", "XVFCVTL_D_S", "XVFCVTH_D_S", ++ "XVFRINTRNE_S", "XVFRINTRZ_S", "XVFRINTRP_S", "XVFRINTRM_S", ++ "XVFTINTRNE_W_S", "XVFTINTRZ_W_S", "XVFTINTRP_W_S", "XVFTINTRM_W_S", ++ "XVFTINT_W_S", "XVFTINTRZ_WU_S", "XVFTINT_WU_S", ++ "XVFTINTRNEL_L_S", "XVFTINTRNEH_L_S", "XVFTINTRZL_L_S", ++ "XVFTINTRZH_L_S", "XVFTINTRPL_L_S", "XVFTINTRPH_L_S", ++ "XVFTINTRML_L_S", "XVFTINTRMH_L_S", "XVFTINTL_L_S", ++ "XVFTINTH_L_S"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v8f32 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_D", ++ "XVFRINT_D", ++ "XVFRINTRNE_D", "XVFRINTRZ_D", "XVFRINTRP_D", "XVFRINTRM_D", ++ "XVFTINTRNE_L_D", "XVFTINTRZ_L_D", "XVFTINTRP_L_D", "XVFTINTRM_L_D", ++ "XVFTINT_L_D", "XVFTINTRZ_LU_D", "XVFTINT_LU_D"] in ++ def : Pat<(deriveLASXIntrinsic.ret (v4f64 LASX256:$xj)), ++ (!cast(Inst) LASX256:$xj)>; ++ ++def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), ++ (XVPICKVE_W v8f32:$xj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm), ++ (XVPICKVE_D v4f64:$xj, (to_valide_timm timm:$imm))>; ++ ++// load ++def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm), ++ (XVLD GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lasx_xvldx GPR:$rj, GPR:$rk), ++ (XVLDX GPR:$rj, GPR:$rk)>; ++ ++def : 
Pat<(int_loongarch_lasx_xvldrepl_b GPR:$rj, timm:$imm), ++ (XVLDREPL_B GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lasx_xvldrepl_h GPR:$rj, timm:$imm), ++ (XVLDREPL_H GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lasx_xvldrepl_w GPR:$rj, timm:$imm), ++ (XVLDREPL_W GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lasx_xvldrepl_d GPR:$rj, timm:$imm), ++ (XVLDREPL_D GPR:$rj, (to_valide_timm timm:$imm))>; ++ ++// store ++def : Pat<(int_loongarch_lasx_xvst LASX256:$xd, GPR:$rj, timm:$imm), ++ (XVST LASX256:$xd, GPR:$rj, (to_valide_timm timm:$imm))>; ++def : Pat<(int_loongarch_lasx_xvstx LASX256:$xd, GPR:$rj, GPR:$rk), ++ (XVSTX LASX256:$xd, GPR:$rj, GPR:$rk)>; ++ ++def : Pat<(int_loongarch_lasx_xvstelm_b v32i8:$xd, GPR:$rj, timm:$imm, timm:$idx), ++ (XVSTELM_B v32i8:$xd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++def : Pat<(int_loongarch_lasx_xvstelm_h v16i16:$xd, GPR:$rj, timm:$imm, timm:$idx), ++ (XVSTELM_H v16i16:$xd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++def : Pat<(int_loongarch_lasx_xvstelm_w v8i32:$xd, GPR:$rj, timm:$imm, timm:$idx), ++ (XVSTELM_W v8i32:$xd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++def : Pat<(int_loongarch_lasx_xvstelm_d v4i64:$xd, GPR:$rj, timm:$imm, timm:$idx), ++ (XVSTELM_D v4i64:$xd, GPR:$rj, (to_valide_timm timm:$imm), ++ (to_valide_timm timm:$idx))>; ++ + } // Predicates = [HasExtLASX] +-- +2.20.1 + + +From 76928242b8b8e6228d1b1ec80c69b61c94d6ec79 Mon Sep 17 00:00:00 2001 +From: chenli +Date: Sat, 19 Aug 2023 17:10:41 +0800 +Subject: [PATCH 04/35] [LoongArch] Add LSX intrinsic testcases + +Depends on D155829 + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D155834 + +(cherry picked from commit f3aa4416319aed198841401c6c9dc2e49afe2507) +--- + .../CodeGen/LoongArch/lsx/intrinsic-absd.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-add.ll | 62 ++ + 
.../CodeGen/LoongArch/lsx/intrinsic-adda.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-addi.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-addw.ll | 290 ++++++++++ + .../CodeGen/LoongArch/lsx/intrinsic-and.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-andi.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-andn.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-avg.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-avgr.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-bitclr.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-bitrev.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-bitsel.ll | 14 + + .../LoongArch/lsx/intrinsic-bitseli.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-bitset.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-bsll.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-bsrl.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-clo.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-clz.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-div.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-exth.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-extl.ll | 26 + + .../LoongArch/lsx/intrinsic-extrins.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-fadd.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fclass.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fcmp.ll | 530 ++++++++++++++++++ + .../CodeGen/LoongArch/lsx/intrinsic-fcvt.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fcvth.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fcvtl.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fdiv.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-ffint.ll | 86 +++ + .../CodeGen/LoongArch/lsx/intrinsic-flogb.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fmadd.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fmax.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fmaxa.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fmin.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fmina.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fmsub.ll | 26 + + 
.../CodeGen/LoongArch/lsx/intrinsic-fmul.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fnmadd.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fnmsub.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-frecip.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-frint.ll | 122 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-frsqrt.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-frstp.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-fsqrt.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-fsub.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-ftint.ll | 350 ++++++++++++ + .../CodeGen/LoongArch/lsx/intrinsic-haddw.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-hsubw.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-ilv.ll | 98 ++++ + .../LoongArch/lsx/intrinsic-insgr2vr.ll | 54 ++ + .../CodeGen/LoongArch/lsx/intrinsic-ld.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-ldi.ll | 62 ++ + .../CodeGen/LoongArch/lsx/intrinsic-ldrepl.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-madd.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-maddw.ll | 290 ++++++++++ + .../CodeGen/LoongArch/lsx/intrinsic-max.ll | 194 +++++++ + .../CodeGen/LoongArch/lsx/intrinsic-min.ll | 194 +++++++ + .../CodeGen/LoongArch/lsx/intrinsic-mod.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-mskgez.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-mskltz.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-msknz.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-msub.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-muh.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-mul.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-mulw.ll | 290 ++++++++++ + .../CodeGen/LoongArch/lsx/intrinsic-neg.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-nor.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-nori.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-or.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-ori.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-orn.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-pack.ll | 98 ++++ + 
.../CodeGen/LoongArch/lsx/intrinsic-pcnt.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-permi.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-pick.ll | 98 ++++ + .../LoongArch/lsx/intrinsic-pickve2gr.ll | 98 ++++ + .../LoongArch/lsx/intrinsic-replgr2vr.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-replve.ll | 50 ++ + .../LoongArch/lsx/intrinsic-replvei.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-rotr.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-sadd.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-sat.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-seq.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-set.ll | 38 ++ + .../LoongArch/lsx/intrinsic-setallnez.ll | 74 +++ + .../LoongArch/lsx/intrinsic-setanyeqz.ll | 74 +++ + .../CodeGen/LoongArch/lsx/intrinsic-shuf.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-shuf4i.ll | 50 ++ + .../LoongArch/lsx/intrinsic-signcov.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-sle.ll | 194 +++++++ + .../CodeGen/LoongArch/lsx/intrinsic-sll.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-sllwil.ll | 74 +++ + .../CodeGen/LoongArch/lsx/intrinsic-slt.ll | 194 +++++++ + .../CodeGen/LoongArch/lsx/intrinsic-sra.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-sran.ll | 38 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srani.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srar.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-srarn.ll | 38 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srarni.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srl.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-srln.ll | 38 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srlni.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srlr.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-srlrn.ll | 38 ++ + .../CodeGen/LoongArch/lsx/intrinsic-srlrni.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-ssran.ll | 74 +++ + .../CodeGen/LoongArch/lsx/intrinsic-ssrani.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-ssrarn.ll | 74 +++ + 
.../LoongArch/lsx/intrinsic-ssrarni.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-ssrln.ll | 74 +++ + .../CodeGen/LoongArch/lsx/intrinsic-ssrlni.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-ssrlrn.ll | 74 +++ + .../LoongArch/lsx/intrinsic-ssrlrni.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-ssub.ll | 98 ++++ + .../CodeGen/LoongArch/lsx/intrinsic-st.ll | 26 + + .../CodeGen/LoongArch/lsx/intrinsic-stelm.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-sub.ll | 62 ++ + .../CodeGen/LoongArch/lsx/intrinsic-subi.ll | 50 ++ + .../CodeGen/LoongArch/lsx/intrinsic-subw.ll | 194 +++++++ + .../CodeGen/LoongArch/lsx/intrinsic-xor.ll | 14 + + .../CodeGen/LoongArch/lsx/intrinsic-xori.ll | 14 + + 123 files changed, 8902 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-absd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-add.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-adda.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-addw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-and.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-andn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-avg.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-avgr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitsel.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lsx/intrinsic-clo.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-clz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-div.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-exth.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-extl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fclass.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvth.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvtl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fdiv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ffint.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-flogb.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmax.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmaxa.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmin.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmina.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecip.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frint.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp.ll + create 
mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsqrt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ftint.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-haddw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-hsubw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ilv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-madd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-maddw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-max.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-min.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-mod.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskgez.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskltz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-msknz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-msub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-muh.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-mul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-mulw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-neg.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-nor.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-or.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-orn.ll + create mode 
100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-pack.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-pcnt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-pick.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-replve.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-signcov.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sran.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarn.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srln.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssran.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrln.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-st.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-subw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-xor.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori.ll + +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-absd.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-absd.ll +new file mode 100644 +index 000000000000..811d9d712de4 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-absd.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vabsd_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vabsd.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vabsd_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vabsd.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vabsd_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vabsd.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vabsd_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vabsd.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vabsd_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vabsd.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ 
++declare <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vabsd_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vabsd.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vabsd_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vabsd.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vabsd_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vabsd_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vabsd.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vabsd.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-add.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-add.ll +new file mode 100644 +index 000000000000..fac16c8308da +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-add.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vadd_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vadd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vadd.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> 
@llvm.loongarch.lsx.vadd.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vadd_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vadd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vadd.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vadd_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vadd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vadd.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vadd_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vadd.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vadd_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vadd_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadd.q $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vadd.q(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-adda.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-adda.ll +new file mode 100644 +index 000000000000..79be0a184bfb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-adda.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8>, <16 x i8>) ++ ++define 
<16 x i8> @lsx_vadda_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vadda_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadda.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vadda.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vadda_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vadda_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadda.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vadda.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vadda_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vadda_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadda.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vadda.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vadda_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vadda_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vadda.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vadda.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi.ll +new file mode 100644 +index 000000000000..b9134e0724fe +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vaddi_bu(<16 x i8> %va) 
nounwind { ++; CHECK-LABEL: lsx_vaddi_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddi.bu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> %va, i32 31) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vaddi_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vaddi_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddi.hu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> %va, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vaddi_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vaddi_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddi.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vaddi_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vaddi_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddi.du $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addw.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addw.ll +new file mode 100644 +index 000000000000..086e3bec12d2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addw.ll +@@ -0,0 +1,290 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vaddwev_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.h.b $vr0, $vr0, 
$vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vaddwev_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vaddwev_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vaddwev_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vaddwev_h_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.h.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vaddwev_w_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_w_hu: ++; CHECK: # %bb.0: # 
%entry ++; CHECK-NEXT: vaddwev.w.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vaddwev_d_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.d.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vaddwev_q_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.q.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vaddwev_h_bu_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.h.bu.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddwev.h.bu.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vaddwev_w_hu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.w.hu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddwev.w.hu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vaddwev_d_wu_w(<4 x i32> 
%va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.d.wu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwev.d.wu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vaddwev_q_du_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwev_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwev.q.du.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwev.q.du.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vaddwod_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vaddwod_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vaddwod_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64>, 
<2 x i64>) ++ ++define <2 x i64> @lsx_vaddwod_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vaddwod_h_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.h.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vaddwod_w_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.w.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vaddwod_d_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.d.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vaddwod_q_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.q.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ 
++declare <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vaddwod_h_bu_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.h.bu.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddwod.h.bu.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vaddwod_w_hu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.w.hu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddwod.w.hu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vaddwod_d_wu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.d.wu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwod.d.wu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vaddwod_q_du_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vaddwod_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vaddwod.q.du.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddwod.q.du.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-and.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-and.ll +new file mode 100644 +index 000000000000..77496239c3a9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-and.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vand_v(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vand_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vand.v(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi.ll +new file mode 100644 +index 000000000000..9a1c38a641d0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vandi_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vandi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vandi.b $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andn.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andn.ll +new file mode 100644 +index 000000000000..b08c759ecc32 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andn.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vandn_v(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vandn_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vandn.v(<16 x i8> 
%va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-avg.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-avg.ll +new file mode 100644 +index 000000000000..fb0861f4cd5e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-avg.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vavg_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vavg.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vavg_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vavg.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vavg_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vavg.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vavg_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vavg.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x 
i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vavg_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vavg.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vavg_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vavg.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vavg_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vavg.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vavg_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vavg_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavg.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vavg.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-avgr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-avgr.ll +new file mode 100644 +index 000000000000..8bf7d0ed8817 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-avgr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x 
i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vavgr_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vavgr.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vavgr_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vavgr.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vavgr_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vavgr.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vavgr_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vavgr.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vavgr_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vavgr.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vavgr_hu(<8 x 
i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vavgr.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vavgr_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vavgr.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vavgr_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vavgr_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vavgr.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vavgr.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll +new file mode 100644 +index 000000000000..f5fba6dbb141 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vbitclr_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitclr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclr.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitclr.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vbitclr_h(<8 x i16> 
%va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitclr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclr.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitclr.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vbitclr_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitclr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclr.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitclr.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vbitclr_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitclr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclr.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitclr.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitclri_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vbitclri_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclri.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitclri_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vbitclri_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclri.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitclri_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vbitclri_w: ++; CHECK: # %bb.0: # 
%entry ++; CHECK-NEXT: vbitclri.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitclri_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vbitclri_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitclri.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev.ll +new file mode 100644 +index 000000000000..ad56e88fdb88 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vbitrev_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitrev_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrev.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitrev.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vbitrev_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitrev_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrev.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitrev.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vbitrev_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitrev_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vbitrev.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitrev.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vbitrev_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitrev_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrev.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitrev.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitrevi_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vbitrevi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrevi.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitrevi_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vbitrevi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrevi.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitrevi_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vbitrevi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrevi.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitrevi_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vbitrevi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitrevi.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> 
@llvm.loongarch.lsx.vbitrevi.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitsel.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitsel.ll +new file mode 100644 +index 000000000000..4b4b5ff1fc8c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitsel.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8>, <16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vbitsel_v(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vbitsel_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitsel.v $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitsel.v(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli.ll +new file mode 100644 +index 000000000000..28d342b5c378 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitseli_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitseli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitseli.b $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> %va, <16 x i8> %vb, i32 255) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset.ll +new file mode 100644 +index 000000000000..75d98e6f8bce +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vbitset_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitset_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitset.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitset.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vbitset_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitset_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitset.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitset.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vbitset_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitset_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitset.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitset.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vbitset_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vbitset_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitset.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitset.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitseti_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vbitseti_b: ++; CHECK: # %bb.0: # 
%entry ++; CHECK-NEXT: vbitseti.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitseti_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vbitseti_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitseti.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitseti_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vbitseti_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitseti.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitseti_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vbitseti_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbitseti.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll.ll +new file mode 100644 +index 000000000000..e7eb1cfcb407 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbsll_v(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vbsll_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbsll.v $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vbsll.v(<16 x i8> %va, i32 31) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl.ll +new file mode 100644 +index 000000000000..fe0565297641 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbsrl_v(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vbsrl_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vbsrl.v $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> %va, i32 31) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-clo.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-clo.ll +new file mode 100644 +index 000000000000..c581109f3fd0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-clo.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vclo_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vclo_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclo.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vclo.b(<16 x i8> %va) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16>) ++ ++define <8 x i16> @lsx_vclo_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vclo_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclo.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vclo.h(<8 x i16> %va) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32>) ++ ++define <4 x 
i32> @lsx_vclo_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vclo_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclo.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vclo.w(<4 x i32> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vclo_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vclo_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclo.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vclo.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-clz.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-clz.ll +new file mode 100644 +index 000000000000..25c37b64349b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-clz.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vclz_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vclz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclz.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vclz.b(<16 x i8> %va) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16>) ++ ++define <8 x i16> @lsx_vclz_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vclz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclz.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vclz.h(<8 x i16> %va) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32>) ++ ++define <4 x i32> @lsx_vclz_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vclz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclz.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vclz.w(<4 x i32> %va) ++ ret <4 x i32> %res 
++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vclz_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vclz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vclz.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vclz.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-div.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-div.ll +new file mode 100644 +index 000000000000..53166e84d269 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-div.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vdiv_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vdiv.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vdiv_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vdiv.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vdiv_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vdiv.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> 
@lsx_vdiv_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vdiv.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vdiv_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vdiv.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vdiv_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vdiv.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vdiv_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vdiv.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vdiv_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vdiv_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vdiv.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vdiv.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-exth.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-exth.ll +new file mode 100644 +index 
000000000000..2f3e891a9eef +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-exth.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8>) ++ ++define <8 x i16> @lsx_vexth_h_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.h.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vexth.h.b(<16 x i8> %va) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16>) ++ ++define <4 x i32> @lsx_vexth_w_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.w.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vexth.w.h(<8 x i16> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32>) ++ ++define <2 x i64> @lsx_vexth_d_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.d.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vexth.d.w(<4 x i32> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vexth_q_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.q.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vexth.q.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8>) ++ ++define <8 x i16> @lsx_vexth_hu_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.hu.bu $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vexth.hu.bu(<16 x i8> %va) 
++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16>) ++ ++define <4 x i32> @lsx_vexth_wu_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.wu.hu $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vexth.wu.hu(<8 x i16> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32>) ++ ++define <2 x i64> @lsx_vexth_du_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.du.wu $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vexth.du.wu(<4 x i32> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64>) ++ ++define <2 x i64> @lsx_vexth_qu_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vexth_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vexth.qu.du $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vexth.qu.du(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extl.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extl.ll +new file mode 100644 +index 000000000000..cbf19e2a3919 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extl.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vextl_q_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vextl_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vextl.q.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vextl.q.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64>) ++ ++define <2 x i64> @lsx_vextl_qu_du(<2 x i64> %va) nounwind { 
++; CHECK-LABEL: lsx_vextl_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vextl.qu.du $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vextl.qu.du(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins.ll +new file mode 100644 +index 000000000000..8f03a2b81291 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vextrins_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vextrins_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vextrins.b $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> %va, <16 x i8> %vb, i32 255) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vextrins_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vextrins_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vextrins.h $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> %va, <8 x i16> %vb, i32 255) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vextrins_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vextrins_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vextrins.w $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> %va, <4 x i32> %vb, i32 255) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> 
@lsx_vextrins_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vextrins_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vextrins.d $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> %va, <2 x i64> %vb, i32 255) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fadd.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fadd.ll +new file mode 100644 +index 000000000000..569002314c92 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fadd.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfadd_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfadd_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfadd.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfadd.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfadd_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfadd.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfadd.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fclass.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fclass.ll +new file mode 100644 +index 000000000000..0c6682187101 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fclass.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x i32> 
@llvm.loongarch.lsx.vfclass.s(<4 x float>) ++ ++define <4 x i32> @lsx_vfclass_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfclass_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfclass.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfclass.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double>) ++ ++define <2 x i64> @lsx_vfclass_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfclass_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfclass.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfclass.d(<2 x double> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll +new file mode 100644 +index 000000000000..669c53b73b16 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll +@@ -0,0 +1,530 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_caf_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_caf_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.caf.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_caf_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_caf_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.caf.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.caf.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> 
@llvm.loongarch.lsx.vfcmp.cun.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cun_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cun_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cun.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cun.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cun_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cun_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cun.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cun.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_ceq_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_ceq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.ceq.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.ceq.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_ceq_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_ceq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.ceq.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.ceq.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cueq_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cueq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cueq.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vfcmp.cueq.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cueq_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cueq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cueq.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cueq.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_clt_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_clt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.clt.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.clt.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_clt_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_clt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.clt.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.clt.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cult_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cult_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cult.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cult.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cult_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cult_d: 
++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cult.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cult.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cle_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cle_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cle.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cle.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cle_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cle_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cle.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cle.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cule_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cule_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cule.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cule.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cule_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cule_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cule.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cule.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float>, <4 x 
float>) ++ ++define <4 x i32> @lsx_vfcmp_cne_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cne_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cne.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cne.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cne_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cne_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cne.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cne.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cor_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cor_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cor.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cor.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cor_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cor_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cor.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cor.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_cune_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cune_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cune.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.cune.s(<4 x float> %va, <4 
x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_cune_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_cune_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.cune.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.cune.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_saf_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_saf_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.saf.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.saf.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_saf_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_saf_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.saf.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.saf.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sun_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sun_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sun.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sun.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sun_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sun_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sun.d $vr0, 
$vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sun.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_seq_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_seq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.seq.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.seq.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_seq_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_seq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.seq.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.seq.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sueq_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sueq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sueq.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sueq.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sueq_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sueq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sueq.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sueq.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_slt_s(<4 x float> %va, <4 x 
float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_slt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.slt.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.slt.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_slt_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_slt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.slt.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.slt.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sult_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sult_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sult.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sult.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sult_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sult_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sult.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sult.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sle_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sle_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sle.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sle.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vfcmp.sle.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sle_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sle_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sle.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sle.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sule_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sule_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sule.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sule.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sule_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sule_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sule.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sule.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sne_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sne_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sne.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sne.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sne_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sne_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sne.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 
x i64> @llvm.loongarch.lsx.vfcmp.sne.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sor_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sor_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sor.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sor.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sor_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sor_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sor.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sor.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float>, <4 x float>) ++ ++define <4 x i32> @lsx_vfcmp_sune_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sune_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sune.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vfcmp.sune.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double>, <2 x double>) ++ ++define <2 x i64> @lsx_vfcmp_sune_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcmp_sune_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcmp.sune.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vfcmp.sune.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvt.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvt.ll +new file mode 100644 +index 000000000000..a6a151a96d84 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvt.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float>, <4 x float>) ++ ++define <8 x i16> @lsx_vfcvt_h_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcvt_h_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcvt.h.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vfcvt.h.s(<4 x float> %va, <4 x float> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double>, <2 x double>) ++ ++define <4 x float> @lsx_vfcvt_s_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfcvt_s_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcvt.s.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfcvt.s.d(<2 x double> %va, <2 x double> %vb) ++ ret <4 x float> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvth.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvth.ll +new file mode 100644 +index 000000000000..a9e4328bd011 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvth.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16>) ++ ++define <4 x float> @lsx_vfcvth_s_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vfcvth_s_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcvth.s.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfcvth.s.h(<8 x i16> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float>) ++ ++define <2 x double> @lsx_vfcvth_d_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfcvth_d_s: 
++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcvth.d.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfcvth.d.s(<4 x float> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvtl.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvtl.ll +new file mode 100644 +index 000000000000..9a69964bb227 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcvtl.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16>) ++ ++define <4 x float> @lsx_vfcvtl_s_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vfcvtl_s_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcvtl.s.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfcvtl.s.h(<8 x i16> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float>) ++ ++define <2 x double> @lsx_vfcvtl_d_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfcvtl_d_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfcvtl.d.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfcvtl.d.s(<4 x float> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fdiv.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fdiv.ll +new file mode 100644 +index 000000000000..1ca8e5e2c0e9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fdiv.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfdiv_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfdiv_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vfdiv.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfdiv.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfdiv_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfdiv_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfdiv.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfdiv.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ffint.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ffint.ll +new file mode 100644 +index 000000000000..62fbcfa339cd +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ffint.ll +@@ -0,0 +1,86 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32>) ++ ++define <4 x float> @lsx_vffint_s_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vffint_s_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffint.s.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vffint.s.w(<4 x i32> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64>) ++ ++define <2 x double> @lsx_vffint_d_l(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vffint_d_l: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffint.d.l $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vffint.d.l(<2 x i64> %va) ++ ret <2 x double> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32>) ++ ++define <4 x float> @lsx_vffint_s_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vffint_s_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffint.s.wu $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ 
%res = call <4 x float> @llvm.loongarch.lsx.vffint.s.wu(<4 x i32> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64>) ++ ++define <2 x double> @lsx_vffint_d_lu(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vffint_d_lu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffint.d.lu $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vffint.d.lu(<2 x i64> %va) ++ ret <2 x double> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32>) ++ ++define <2 x double> @lsx_vffintl_d_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vffintl_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffintl.d.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vffintl.d.w(<4 x i32> %va) ++ ret <2 x double> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32>) ++ ++define <2 x double> @lsx_vffinth_d_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vffinth_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffinth.d.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vffinth.d.w(<4 x i32> %va) ++ ret <2 x double> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64>, <2 x i64>) ++ ++define <4 x float> @lsx_vffint_s_l(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vffint_s_l: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vffint.s.l $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vffint.s.l(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x float> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-flogb.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-flogb.ll +new file mode 100644 +index 000000000000..d8382acc70ed +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-flogb.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float>) ++ ++define <4 x float> @lsx_vflogb_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vflogb_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vflogb.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vflogb.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double>) ++ ++define <2 x double> @lsx_vflogb_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vflogb_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vflogb.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vflogb.d(<2 x double> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmadd.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmadd.ll +new file mode 100644 +index 000000000000..adbaf6c76b1b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmadd.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float>, <4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmadd_s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) nounwind { ++; CHECK-LABEL: lsx_vfmadd_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmadd.s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double>, <2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmadd_d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) nounwind { ++; CHECK-LABEL: lsx_vfmadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ 
%res = call <2 x double> @llvm.loongarch.lsx.vfmadd.d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmax.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmax.ll +new file mode 100644 +index 000000000000..89f757c4e456 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmax.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmax_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmax_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmax.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmax.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmax_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmax_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmax.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfmax.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmaxa.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmaxa.ll +new file mode 100644 +index 000000000000..5662acc0b9a1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmaxa.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmaxa_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmaxa_s: ++; CHECK: # %bb.0: # 
%entry ++; CHECK-NEXT: vfmaxa.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmaxa.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmaxa_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmaxa_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmaxa.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfmaxa.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmin.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmin.ll +new file mode 100644 +index 000000000000..0f844240277f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmin.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmin_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmin_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmin.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmin.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmin_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmin_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmin.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfmin.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmina.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmina.ll +new file mode 100644 +index 
000000000000..27f70b5fba32 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmina.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmina_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmina_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmina.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmina.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmina_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmina_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmina.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfmina.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmsub.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmsub.ll +new file mode 100644 +index 000000000000..856ca9cadbd9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmsub.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float>, <4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmsub_s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) nounwind { ++; CHECK-LABEL: lsx_vfmsub_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmsub.s $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmsub.s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) ++ ret <4 x float> %res ++} ++ ++declare <2 x 
double> @llvm.loongarch.lsx.vfmsub.d(<2 x double>, <2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmsub_d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) nounwind { ++; CHECK-LABEL: lsx_vfmsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmsub.d $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfmsub.d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmul.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmul.ll +new file mode 100644 +index 000000000000..1e6c4c77d536 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fmul.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfmul_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmul_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmul.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfmul.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfmul_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfmul_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfmul.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfmul.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmadd.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmadd.ll +new file mode 100644 +index 000000000000..e1a9ea78ef9d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmadd.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float>, <4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfnmadd_s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) nounwind { ++; CHECK-LABEL: lsx_vfnmadd_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfnmadd.s $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfnmadd.s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double>, <2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfnmadd_d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) nounwind { ++; CHECK-LABEL: lsx_vfnmadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfnmadd.d $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfnmadd.d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmsub.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmsub.ll +new file mode 100644 +index 000000000000..46db0f4a5061 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fnmsub.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float>, <4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfnmsub_s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) nounwind { ++; CHECK-LABEL: lsx_vfnmsub_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfnmsub.s(<4 x float> %va, <4 x float> %vb, <4 x float> %vc) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> 
@llvm.loongarch.lsx.vfnmsub.d(<2 x double>, <2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfnmsub_d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) nounwind { ++; CHECK-LABEL: lsx_vfnmsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfnmsub.d(<2 x double> %va, <2 x double> %vb, <2 x double> %vc) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecip.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecip.ll +new file mode 100644 +index 000000000000..669fde5912d4 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecip.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrecip_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrecip_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrecip.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrecip.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrecip_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrecip_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrecip.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrecip.d(<2 x double> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frint.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frint.ll +new file mode 100644 +index 000000000000..8d872fc72962 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frint.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 
--mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrintrne_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrne_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrne.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrintrne.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrintrne_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrne_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrne.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrintrne.d(<2 x double> %va) ++ ret <2 x double> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrintrz_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrz_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrz.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrintrz.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrintrz_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrz.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrintrz.d(<2 x double> %va) ++ ret <2 x double> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrintrp_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrp_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrp.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrintrp.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double>) ++ ++define <2 x double> 
@lsx_vfrintrp_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrp_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrp.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrintrp.d(<2 x double> %va) ++ ret <2 x double> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrintrm_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrm_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrm.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrintrm.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrintrm_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrintrm_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrintrm.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrintrm.d(<2 x double> %va) ++ ret <2 x double> %res ++} ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrint_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrint_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrint.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrint.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrint_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrint_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrint.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrint.d(<2 x double> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrt.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrt.ll +new file mode 100644 +index 000000000000..326d87308b0b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrt.ll 
+@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float>) ++ ++define <4 x float> @lsx_vfrsqrt_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfrsqrt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrsqrt.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfrsqrt.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double>) ++ ++define <2 x double> @lsx_vfrsqrt_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfrsqrt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrsqrt.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfrsqrt.d(<2 x double> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp.ll +new file mode 100644 +index 000000000000..5c072b194d4f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8>, <16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vfrstp_b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vfrstp_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrstp.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vfrstp.b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16>, <8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vfrstp_h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vfrstp_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vfrstp.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vfrstp.h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vfrstpi_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vfrstpi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrstpi.b $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> %va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vfrstpi_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vfrstpi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfrstpi.h $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsqrt.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsqrt.ll +new file mode 100644 +index 000000000000..55bffba9e99e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsqrt.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float>) ++ ++define <4 x float> @lsx_vfsqrt_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vfsqrt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfsqrt.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfsqrt.s(<4 x float> %va) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double>) ++ ++define <2 x double> @lsx_vfsqrt_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vfsqrt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfsqrt.d $vr0, $vr0 
++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfsqrt.d(<2 x double> %va) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsub.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsub.ll +new file mode 100644 +index 000000000000..2beba4a70dc9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fsub.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float>, <4 x float>) ++ ++define <4 x float> @lsx_vfsub_s(<4 x float> %va, <4 x float> %vb) nounwind { ++; CHECK-LABEL: lsx_vfsub_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x float> @llvm.loongarch.lsx.vfsub.s(<4 x float> %va, <4 x float> %vb) ++ ret <4 x float> %res ++} ++ ++declare <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double>, <2 x double>) ++ ++define <2 x double> @lsx_vfsub_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vfsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x double> @llvm.loongarch.lsx.vfsub.d(<2 x double> %va, <2 x double> %vb) ++ ret <2 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ftint.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ftint.ll +new file mode 100644 +index 000000000000..2a494cd7fa87 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ftint.ll +@@ -0,0 +1,350 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftintrne_w_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrne_w_s: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: vftintrne.w.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftintrne_l_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrne_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrne.l.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrne.l.d(<2 x double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftintrz_w_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrz_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrz.w.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftintrz_l_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrz_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrz.l.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrz.l.d(<2 x double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftintrp_w_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrp_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrp.w.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftintrp_l_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrp_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrp.l.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrp.l.d(<2 x 
double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftintrm_w_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrm_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrm.w.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftintrm_l_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrm_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrm.l.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrm.l.d(<2 x double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftint_w_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftint_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftint.w.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftint.w.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftint_l_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftint_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftint.l.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftint.l.d(<2 x double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftintrz_wu_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrz_wu_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrz.wu.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrz.wu.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftintrz_lu_d(<2 
x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrz_lu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrz.lu.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrz.lu.d(<2 x double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float>) ++ ++define <4 x i32> @lsx_vftint_wu_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftint_wu_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftint.wu.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftint.wu.s(<4 x float> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double>) ++ ++define <2 x i64> @lsx_vftint_lu_d(<2 x double> %va) nounwind { ++; CHECK-LABEL: lsx_vftint_lu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftint.lu.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftint.lu.d(<2 x double> %va) ++ ret <2 x i64> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double>, <2 x double>) ++ ++define <4 x i32> @lsx_vftintrne_w_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vftintrne_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrne.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrne.w.d(<2 x double> %va, <2 x double> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double>, <2 x double>) ++ ++define <4 x i32> @lsx_vftintrz_w_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vftintrz_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrz.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrz.w.d(<2 x double> %va, <2 x double> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double>, <2 x double>) ++ ++define <4 x i32> @lsx_vftintrp_w_d(<2 x 
double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vftintrp_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrp.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrp.w.d(<2 x double> %va, <2 x double> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double>, <2 x double>) ++ ++define <4 x i32> @lsx_vftintrm_w_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vftintrm_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrm.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftintrm.w.d(<2 x double> %va, <2 x double> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double>, <2 x double>) ++ ++define <4 x i32> @lsx_vftint_w_d(<2 x double> %va, <2 x double> %vb) nounwind { ++; CHECK-LABEL: lsx_vftint_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftint.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vftint.w.d(<2 x double> %va, <2 x double> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrnel_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrnel_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrnel.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrnel.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrneh_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrneh_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrneh.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrneh.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float>) ++ ++define <2 x 
i64> @lsx_vftintrzl_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrzl_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrzl.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrzl.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrzh_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrzh_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrzh.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrzh.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrpl_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrpl_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrpl.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrpl.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrph_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrph_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrph.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrph.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrml_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrml_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintrml.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrml.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintrmh_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintrmh_l_s: ++; CHECK: # %bb.0: # %entry 
++; CHECK-NEXT: vftintrmh.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintrmh.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftintl_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftintl_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftintl.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftintl.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float>) ++ ++define <2 x i64> @lsx_vftinth_l_s(<4 x float> %va) nounwind { ++; CHECK-LABEL: lsx_vftinth_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vftinth.l.s $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vftinth.l.s(<4 x float> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-haddw.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-haddw.ll +new file mode 100644 +index 000000000000..05725582334a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-haddw.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vhaddw_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vhaddw.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vhaddw_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vhaddw.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vhaddw_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhaddw.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vhaddw_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhaddw.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vhaddw_hu_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.hu.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vhaddw.hu.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vhaddw_wu_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.wu.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vhaddw.wu.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vhaddw_du_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vhaddw.du.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhaddw.du.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vhaddw_qu_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vhaddw_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhaddw.qu.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhaddw.qu.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-hsubw.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-hsubw.ll +new file mode 100644 +index 000000000000..dd5815b2ea85 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-hsubw.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vhsubw_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vhsubw.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vhsubw_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vhsubw.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vhsubw_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_d_w: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhsubw.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vhsubw_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhsubw.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vhsubw_hu_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.hu.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vhsubw.hu.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vhsubw_wu_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.wu.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vhsubw.wu.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vhsubw_du_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vhsubw_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.du.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhsubw.du.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vhsubw_qu_du(<2 x i64> %va, <2 x i64> %vb) 
nounwind { ++; CHECK-LABEL: lsx_vhsubw_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vhsubw.qu.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vhsubw.qu.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ilv.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ilv.ll +new file mode 100644 +index 000000000000..77b0b3484df8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ilv.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vilvl_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvl_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vilvl.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vilvl_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvl_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vilvl.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vilvl_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvl_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvl.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vilvl.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vilvl_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: 
lsx_vilvl_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvl.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vilvl.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vilvh_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvh_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvh.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vilvh.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vilvh_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvh_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvh.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vilvh.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vilvh_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvh_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvh.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vilvh.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vilvh_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vilvh_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vilvh.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vilvh.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr.ll +new file mode 100644 +index 000000000000..61d2cbd28066 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr.ll +@@ -0,0 +1,54 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8>, i32, i32) ++ ++define <16 x i8> @lsx_vinsgr2vr_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vinsgr2vr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a0, $zero, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> %va, i32 1, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16>, i32, i32) ++ ++define <8 x i16> @lsx_vinsgr2vr_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vinsgr2vr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a0, $zero, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> %va, i32 1, i32 7) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32>, i32, i32) ++ ++define <4 x i32> @lsx_vinsgr2vr_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vinsgr2vr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a0, $zero, 1 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> %va, i32 1, i32 3) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32) ++ ++define <2 x i64> @lsx_vinsgr2vr_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vinsgr2vr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a0, $zero, 1 ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %va, i64 1, i32 1) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld.ll +new file mode 100644 +index 000000000000..b9e2ff8088d8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vld(i8*, i32) ++ ++define <16 x i8> @lsx_vld(i8* %p) nounwind { ++; CHECK-LABEL: lsx_vld: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vld(i8* %p, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vldx(i8*, i64) ++ ++define <16 x i8> @lsx_vldx(i8* %p, i64 %b) nounwind { ++; CHECK-LABEL: lsx_vldx: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vldx $vr0, $a0, $a1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vldx(i8* %p, i64 %b) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi.ll +new file mode 100644 +index 000000000000..ace910b54d9a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <2 x i64> @llvm.loongarch.lsx.vldi(i32) ++ ++define <2 x i64> @lsx_vldi() nounwind { ++; CHECK-LABEL: lsx_vldi: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vldi $vr0, 4095 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldi(i32 4095) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32) ++ ++define <16 x i8> @lsx_vrepli_b() nounwind { ++; CHECK-LABEL: lsx_vrepli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.b $vr0, 511 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 511) ++ ret <16 x 
i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32) ++ ++define <8 x i16> @lsx_vrepli_h() nounwind { ++; CHECK-LABEL: lsx_vrepli_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.h $vr0, 511 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 511) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32) ++ ++define <4 x i32> @lsx_vrepli_w() nounwind { ++; CHECK-LABEL: lsx_vrepli_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.w $vr0, 511 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 511) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32) ++ ++define <2 x i64> @lsx_vrepli_d() nounwind { ++; CHECK-LABEL: lsx_vrepli_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.d $vr0, 511 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 511) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl.ll +new file mode 100644 +index 000000000000..1a9cf3d3a766 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8*, i32) ++ ++define <16 x i8> @lsx_vldrepl_b(i8* %p, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vldrepl_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vldrepl.b $vr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8* %p, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8*, i32) ++ ++define <8 x i16> @lsx_vldrepl_h(i8* %p, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vldrepl_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vldrepl.h $vr0, $a0, 2 ++; CHECK-NEXT: ret ++entry: ++ 
%res = call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8* %p, i32 2) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8*, i32) ++ ++define <4 x i32> @lsx_vldrepl_w(i8* %p, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vldrepl_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vldrepl.w $vr0, $a0, 4 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8* %p, i32 4) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8*, i32) ++ ++define <2 x i64> @lsx_vldrepl_d(i8* %p, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vldrepl_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vldrepl.d $vr0, $a0, 8 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8* %p, i32 8) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-madd.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-madd.ll +new file mode 100644 +index 000000000000..89503724fd73 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-madd.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8>, <16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmadd_b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmadd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmadd.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmadd.b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x i16>, <8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmadd_h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmadd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmadd.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmadd.h(<8 x 
i16> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32>, <4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmadd_w(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmadd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmadd.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmadd.w(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmadd_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmadd.d $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmadd.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-maddw.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-maddw.ll +new file mode 100644 +index 000000000000..1e3ab25a5fcf +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-maddw.ll +@@ -0,0 +1,290 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16>, <16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmaddwev_h_b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.h.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32>, <8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmaddwev_w_h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: 
lsx_vmaddwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.w.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64>, <4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmaddwev_d_w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.d.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmaddwev_q_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.q.d $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16>, <16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmaddwev_h_bu(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.h.bu $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32>, <8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmaddwev_w_hu(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.w.hu $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vmaddwev.w.hu(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64>, <4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmaddwev_d_wu(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.d.wu $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmaddwev_q_du(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.q.du $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16>, <16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmaddwev_h_bu_b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.h.bu.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaddwev.h.bu.b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32>, <8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmaddwev_w_hu_h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.w.hu.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaddwev.w.hu.h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64>, <4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmaddwev_d_wu_w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.d.wu.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwev.d.wu.w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmaddwev_q_du_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwev_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwev.q.du.d $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwev.q.du.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16>, <16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmaddwod_h_b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.h.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32>, <8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmaddwod_w_h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.w.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64>, <4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmaddwod_d_w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) 
nounwind { ++; CHECK-LABEL: lsx_vmaddwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.d.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmaddwod_q_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.q.d $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16>, <16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmaddwod_h_bu(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.h.bu $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32>, <8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmaddwod_w_hu(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.w.hu $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64>, <4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmaddwod_d_wu(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.d.wu $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x 
i64> @llvm.loongarch.lsx.vmaddwod.d.wu(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmaddwod_q_du(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.q.du $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16>, <16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmaddwod_h_bu_b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.h.bu.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaddwod.h.bu.b(<8 x i16> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32>, <8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmaddwod_w_hu_h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.w.hu.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaddwod.w.hu.h(<4 x i32> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64>, <4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmaddwod_d_wu_w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.d.wu.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwod.d.wu.w(<2 x i64> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmaddwod_q_du_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmaddwod_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaddwod.q.du.d $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaddwod.q.du.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max.ll +new file mode 100644 +index 000000000000..4dd289cf6ed7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmax_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmax.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmax_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmax.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmax_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmax.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} 
++ ++declare <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmax_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmax.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmaxi_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.b $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> %va, i32 -16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmaxi_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.h $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> %va, i32 -16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmaxi_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.w $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> %va, i32 15) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmaxi_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.d $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> %va, i32 15) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmax_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_bu: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmax.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmax_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmax.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmax_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmax.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmax_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmax_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmax.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmaxi_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.bu $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmaxi_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.hu $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call 
<8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> %va, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmaxi_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmaxi_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vmaxi_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmaxi.du $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min.ll +new file mode 100644 +index 000000000000..aa12a5ead6a3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmin_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmin.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmin_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmin.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 
x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmin_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmin.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmin_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmin.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmini_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.b $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> %va, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmini_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmini_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.w $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> %va, i32 -16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmini_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_d: ++; CHECK: # %bb.0: 
# %entry ++; CHECK-NEXT: vmini.d $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> %va, i32 -16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmin_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmin.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmin_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmin.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmin_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmin.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmin_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmin_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmin.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmin.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmini_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.bu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = 
call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> %va, i32 31) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmini_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.hu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> %va, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmini_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmini_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vmini_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmini.du $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mod.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mod.ll +new file mode 100644 +index 000000000000..6b3dc6865584 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mod.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmod_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmod.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> 
@llvm.loongarch.lsx.vmod.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmod_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmod.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmod_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmod.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmod_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmod.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmod_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmod.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmod_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmod.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> 
@lsx_vmod_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmod.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmod_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmod_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmod.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmod.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskgez.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskgez.ll +new file mode 100644 +index 000000000000..3ecd777aee67 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskgez.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vmskgez_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vmskgez_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmskgez.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmskgez.b(<16 x i8> %va) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskltz.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskltz.ll +new file mode 100644 +index 000000000000..be00c76137c7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mskltz.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vmskltz_b(<16 x i8> %va) nounwind { ++; 
CHECK-LABEL: lsx_vmskltz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmskltz.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmskltz.b(<16 x i8> %va) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16>) ++ ++define <8 x i16> @lsx_vmskltz_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vmskltz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmskltz.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmskltz.h(<8 x i16> %va) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32>) ++ ++define <4 x i32> @lsx_vmskltz_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vmskltz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmskltz.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmskltz.w(<4 x i32> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vmskltz_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vmskltz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmskltz.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmskltz.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-msknz.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-msknz.ll +new file mode 100644 +index 000000000000..02f1752f7190 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-msknz.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vmsknz_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vmsknz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmsknz.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmsknz.b(<16 x i8> 
%va) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-msub.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-msub.ll +new file mode 100644 +index 000000000000..98684e10c78e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-msub.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8>, <16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmsub_b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vmsub_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmsub.b $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmsub.b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16>, <8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmsub_h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vmsub_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmsub.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmsub.h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32>, <4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmsub_w(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vmsub_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmsub.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmsub.w(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmsub_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vmsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmsub.d $vr0, $vr1, $vr2 
++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmsub.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-muh.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-muh.ll +new file mode 100644 +index 000000000000..a4deb8f8f823 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-muh.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmuh_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmuh.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmuh_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmuh.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmuh_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmuh.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmuh_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x 
i64> @llvm.loongarch.lsx.vmuh.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmuh_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmuh.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmuh_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmuh.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmuh_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmuh.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmuh_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmuh_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmuh.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmuh.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mul.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mul.ll +new file mode 100644 +index 000000000000..aca60d1663b7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mul.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vmul_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmul_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmul.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmul.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vmul_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmul_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmul.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vmul_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmul_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmul.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmul_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmul_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmul.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mulw.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mulw.ll +new file mode 100644 +index 000000000000..eb55c1f809e3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-mulw.ll +@@ -0,0 +1,290 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ 
++declare <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmulwev_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmulwev_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmulwev_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmulwev_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmulwev_h_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.h.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu(<16 x i8> %va, <16 
x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmulwev_w_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.w.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmulwev_d_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.d.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmulwev_q_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.q.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmulwev_h_bu_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.h.bu.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmulwev.h.bu.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmulwev_w_hu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.w.hu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmulwev.w.hu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmulwev_d_wu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.d.wu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwev.d.wu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmulwev_q_du_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwev_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwev.q.du.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwev.q.du.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmulwod_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmulwod_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmulwod_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_d_w: ++; CHECK: # %bb.0: # 
%entry ++; CHECK-NEXT: vmulwod.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmulwod_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmulwod_h_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.h.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmulwod_w_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.w.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmulwod_d_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.d.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmulwod_q_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; 
CHECK-LABEL: lsx_vmulwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.q.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vmulwod_h_bu_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.h.bu.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmulwod.h.bu.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vmulwod_w_hu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.w.hu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmulwod.w.hu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vmulwod_d_wu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.d.wu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwod.d.wu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vmulwod_q_du_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vmulwod_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vmulwod.q.du.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmulwod.q.du.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-neg.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-neg.ll +new file mode 100644 +index 000000000000..43c6e9757614 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-neg.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vneg_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vneg_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vneg.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vneg.b(<16 x i8> %va) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16>) ++ ++define <8 x i16> @lsx_vneg_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vneg_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vneg.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vneg.h(<8 x i16> %va) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32>) ++ ++define <4 x i32> @lsx_vneg_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vneg_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vneg.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vneg.w(<4 x i32> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vneg_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vneg_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vneg.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vneg.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nor.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nor.ll +new file mode 100644 +index 000000000000..16619225f2d1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nor.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vnor_v(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vnor_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vnor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vnor.v(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori.ll +new file mode 100644 +index 000000000000..c2388a1e0da3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vnori_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vnori_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vnori.b $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-or.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-or.ll +new file mode 100644 +index 000000000000..ab557003d150 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-or.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vor_v(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vor_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vor.v(<16 x i8> %va, <16 x i8> 
%vb) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori.ll +new file mode 100644 +index 000000000000..85c0f432c54a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vori_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vori_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vori.b $vr0, $vr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> %va, i32 3) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-orn.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-orn.ll +new file mode 100644 +index 000000000000..4528628e02c3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-orn.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vorn_v(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vorn_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vorn.v(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pack.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pack.ll +new file mode 100644 +index 000000000000..70a3620d1757 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pack.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare 
<16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vpackev_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackev_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackev.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vpackev.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vpackev_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackev_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackev.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vpackev.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vpackev_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackev_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackev.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpackev.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vpackev_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackev_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackev.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vpackev.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vpackod_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackod_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackod.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vpackod.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> 
@llvm.loongarch.lsx.vpackod.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vpackod_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackod_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackod.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vpackod.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vpackod_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackod_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackod.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpackod.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vpackod_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vpackod_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpackod.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vpackod.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pcnt.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pcnt.ll +new file mode 100644 +index 000000000000..431b270ab0a1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pcnt.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8>) ++ ++define <16 x i8> @lsx_vpcnt_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vpcnt_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpcnt.b $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vpcnt.b(<16 x i8> %va) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16>) ++ ++define <8 x 
i16> @lsx_vpcnt_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vpcnt_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpcnt.h $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vpcnt.h(<8 x i16> %va) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32>) ++ ++define <4 x i32> @lsx_vpcnt_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vpcnt_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpcnt.w $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpcnt.w(<4 x i32> %va) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64>) ++ ++define <2 x i64> @lsx_vpcnt_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vpcnt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpcnt.d $vr0, $vr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vpcnt.d(<2 x i64> %va) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi.ll +new file mode 100644 +index 000000000000..b8367d98caf6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vpermi_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vpermi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpermi.w $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> %va, <4 x i32> %vb, i32 255) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pick.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pick.ll +new file mode 100644 +index 000000000000..4ebf29e1409c +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pick.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vpickev_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickev_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickev.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vpickev.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vpickev_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickev_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickev.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vpickev.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vpickev_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickev_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickev.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpickev.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vpickev_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickev_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickev.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vpickev.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vpickod_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickod_b: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickod.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vpickod.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vpickod_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickod_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickod.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vpickod.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vpickod_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickod_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickod.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpickod.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vpickod_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vpickod_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickod.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vpickod.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll +new file mode 100644 +index 000000000000..ed56d30ce3c4 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) ++ ++define i32 @lsx_vpickve2gr_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_b: ++; CHECK: # 
%bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> %va, i32 15) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16>, i32) ++ ++define i32 @lsx_vpickve2gr_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> %va, i32 7) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32>, i32) ++ ++define i32 @lsx_vpickve2gr_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> %va, i32 3) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) ++ ++define i64 @lsx_vpickve2gr_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 1) ++ ret i64 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) ++ ++define i32 @lsx_vpickve2gr_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.bu $a0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> %va, i32 15) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16>, i32) ++ ++define i32 @lsx_vpickve2gr_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> %va, i32 7) ++ ret i32 %res ++} ++ ++declare i32 
@llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32>, i32) ++ ++define i32 @lsx_vpickve2gr_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.wu $a0, $vr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 3) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) ++ ++define i64 @lsx_vpickve2gr_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vpickve2gr_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vpickve2gr.du $a0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 1) ++ ret i64 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll +new file mode 100644 +index 000000000000..091f1c98c228 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) ++ ++define <16 x i8> @lsx_vreplgr2vr_b(i32 %a) nounwind { ++; CHECK-LABEL: lsx_vreplgr2vr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.b $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 %a) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32) ++ ++define <8 x i16> @lsx_vreplgr2vr_h(i32 %a) nounwind { ++; CHECK-LABEL: lsx_vreplgr2vr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.h $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32 %a) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32) ++ ++define <4 x i32> @lsx_vreplgr2vr_w(i32 %a) nounwind { ++; CHECK-LABEL: lsx_vreplgr2vr_w: ++; CHECK: # %bb.0: # %entry 
++; CHECK-NEXT: vreplgr2vr.w $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 %a) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) ++ ++define <2 x i64> @lsx_vreplgr2vr_d(i64 %a) nounwind { ++; CHECK-LABEL: lsx_vreplgr2vr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replve.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replve.ll +new file mode 100644 +index 000000000000..3ba184dad052 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replve.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vreplve_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vreplve_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplve.b $vr0, $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vreplve.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vreplve_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vreplve_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplve.h $vr0, $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vreplve.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vreplve.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vreplve_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vreplve_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplve.w $vr0, $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vreplve.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vreplve_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK-LABEL: lsx_vreplve_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplve.d $vr0, $vr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vreplve.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei.ll +new file mode 100644 +index 000000000000..9b8af1878cb8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vreplvei_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vreplvei_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplvei.b $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> %va, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vreplvei_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vreplvei_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplvei.h $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> %va, i32 7) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vreplvei_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vreplvei_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> %va, i32 3) ++ ret <4 x i32> %res ++} ++ ++declare 
<2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vreplvei_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vreplvei_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> %va, i32 1) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr.ll +new file mode 100644 +index 000000000000..df8650677147 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vrotr_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vrotr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotr.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrotr.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vrotr_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vrotr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotr.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrotr.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vrotr_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vrotr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotr.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrotr.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64>, <2 x 
i64>) ++ ++define <2 x i64> @lsx_vrotr_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vrotr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotr.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrotr.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vrotri_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vrotri_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotri.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vrotri_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vrotri_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotri.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vrotri_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vrotri_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotri.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vrotri_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vrotri_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrotri.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sadd.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sadd.ll +new file mode 100644 +index 000000000000..a54f955766df +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sadd.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsadd_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsadd.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsadd_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsadd.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsadd_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsadd.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsadd_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsadd.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsadd_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vsadd.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsadd.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsadd_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsadd.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsadd_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsadd.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsadd_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsadd_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsadd.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsadd.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat.ll +new file mode 100644 +index 000000000000..4286842a63b9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsat_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.b $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: 
++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsat_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.h $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> %va, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsat_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.w $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> %va, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsat_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.d $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> %va, i32 1) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsat_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.bu $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsat_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.hu $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsat_wu(<4 x i32> %va) nounwind { ++; 
CHECK-LABEL: lsx_vsat_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsat_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsat_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsat.du $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq.ll +new file mode 100644 +index 000000000000..3cb4acd82439 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vseq_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vseq_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseq.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vseq.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vseq_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vseq_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseq.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vseq.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vseq_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vseq_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseq.w $vr0, $vr0, $vr1 
++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vseq.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vseq_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vseq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseq.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vseq.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vseqi_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vseqi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseqi.b $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> %va, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vseqi_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vseqi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseqi.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vseqi_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vseqi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseqi.w $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> %va, i32 -16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vseqi_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vseqi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseqi.d $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> %va, i32 -16) ++ ret <2 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll +new file mode 100644 +index 000000000000..3188fb4e2c2e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lsx.bz.v(<16 x i8>) ++ ++define i32 @lsx_bz_v(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_bz_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vseteqz.v $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB0_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB0_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bz.v(<16 x i8> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bnz.v(<16 x i8>) ++ ++define i32 @lsx_bnz_v(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_bnz_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetnez.v $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB1_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB1_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bnz.v(<16 x i8> %va) ++ ret i32 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll +new file mode 100644 +index 000000000000..22e01922e87b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lsx.bnz.b(<16 x i8>) ++ ++define i32 @lsx_bnz_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_bnz_b: ++; CHECK: # %bb.0: # %entry 
++; CHECK-NEXT: vsetallnez.b $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB0_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB0_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bnz.b(<16 x i8> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bnz.h(<8 x i16>) ++ ++define i32 @lsx_bnz_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_bnz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetallnez.h $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB1_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB1_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bnz.h(<8 x i16> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bnz.w(<4 x i32>) ++ ++define i32 @lsx_bnz_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_bnz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetallnez.w $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB2_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB2_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bnz.w(<4 x i32> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bnz.d(<2 x i64>) ++ ++define i32 @lsx_bnz_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_bnz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetallnez.d $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB3_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB3_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bnz.d(<2 x i64> %va) ++ ret i32 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll +new file mode 100644 +index 000000000000..96c79c10e468 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lsx.bz.b(<16 x i8>) ++ ++define i32 @lsx_bz_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_bz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetanyeqz.b $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB0_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB0_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bz.b(<16 x i8> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bz.h(<8 x i16>) ++ ++define i32 @lsx_bz_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_bz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetanyeqz.h $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB1_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB1_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bz.h(<8 x i16> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bz.w(<4 x i32>) ++ ++define i32 @lsx_bz_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_bz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetanyeqz.w $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB2_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB2_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bz.w(<4 x i32> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.bz.d(<2 x i64>) ++ ++define i32 @lsx_bz_d(<2 x i64> %va) nounwind { ++; 
CHECK-LABEL: lsx_bz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsetanyeqz.d $fcc0, $vr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB3_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB3_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lsx.bz.d(<2 x i64> %va) ++ ret i32 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf.ll +new file mode 100644 +index 000000000000..f5d516521e45 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8>, <16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vshuf_b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) nounwind { ++; CHECK-LABEL: lsx_vshuf_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vshuf.b(<16 x i8> %va, <16 x i8> %vb, <16 x i8> %vc) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16>, <8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vshuf_h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) nounwind { ++; CHECK-LABEL: lsx_vshuf_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf.h $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vshuf.h(<8 x i16> %va, <8 x i16> %vb, <8 x i16> %vc) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32>, <4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vshuf_w(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc) nounwind { ++; CHECK-LABEL: lsx_vshuf_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf.w $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x 
i32> @llvm.loongarch.lsx.vshuf.w(<4 x i32> %va, <4 x i32> %vb, <4 x i32> %vc) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64>, <2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vshuf_d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) nounwind { ++; CHECK-LABEL: lsx_vshuf_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf.d $vr0, $vr1, $vr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vshuf.d(<2 x i64> %va, <2 x i64> %vb, <2 x i64> %vc) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i.ll +new file mode 100644 +index 000000000000..1ad5f2af5591 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vshuf4i_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vshuf4i_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> %va, i32 255) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vshuf4i_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vshuf4i_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> %va, i32 255) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vshuf4i_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vshuf4i_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vshuf4i.w(<4 x i32> %va, i32 255) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vshuf4i_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vshuf4i_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vshuf4i.d $vr0, $vr1, 255 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> %va, <2 x i64> %vb, i32 255) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-signcov.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-signcov.ll +new file mode 100644 +index 000000000000..3997b0cc995c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-signcov.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsigncov_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsigncov_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsigncov.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsigncov.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsigncov_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsigncov_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsigncov.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsigncov.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsigncov_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsigncov_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsigncov.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsigncov.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsigncov_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsigncov_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsigncov.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsigncov.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle.ll +new file mode 100644 +index 000000000000..5a9d5f06e63f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsle_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsle.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsle_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsle.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsle.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsle_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vsle.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsle_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsle.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslei_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.b $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> %va, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslei_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslei_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.w $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> %va, i32 -16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslei_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.d $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> %va, i32 -16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> 
@lsx_vsle_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsle.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsle_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsle.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsle_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsle.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsle_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsle_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsle.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsle.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslei_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.bu $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslei_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_hu: ++; CHECK: # %bb.0: # 
%entry ++; CHECK-NEXT: vslei.hu $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> %va, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslei_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslei_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vslei_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslei.du $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll.ll +new file mode 100644 +index 000000000000..7bc20af41f17 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsll_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsll_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsll.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsll.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsll.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsll_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsll_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsll.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> 
@llvm.loongarch.lsx.vsll.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsll_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsll_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsll.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsll.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsll_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsll_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsll.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsll.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslli_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vslli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslli.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslli_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vslli_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslli.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslli_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vslli_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslli.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64>, i32) ++ ++define <2 x 
i64> @lsx_vslli_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vslli_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslli.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil.ll +new file mode 100644 +index 000000000000..29ab70da1ced +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8>, i32) ++ ++define <8 x i16> @lsx_vsllwil_h_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsllwil_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsllwil.h.b $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> %va, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16>, i32) ++ ++define <4 x i32> @lsx_vsllwil_w_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsllwil_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsllwil.w.h $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> %va, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32>, i32) ++ ++define <2 x i64> @lsx_vsllwil_d_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsllwil_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsllwil.d.w $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> %va, i32 1) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8>, i32) ++ ++define <8 x i16> @lsx_vsllwil_hu_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsllwil_hu_bu: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsllwil.hu.bu $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> %va, i32 7) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16>, i32) ++ ++define <4 x i32> @lsx_vsllwil_wu_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsllwil_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsllwil.wu.hu $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> %va, i32 15) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32>, i32) ++ ++define <2 x i64> @lsx_vsllwil_du_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsllwil_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsllwil.du.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt.ll +new file mode 100644 +index 000000000000..18683e9dc46f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vslt_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslt.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vslt_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.h 
$vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslt.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vslt_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslt.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vslt_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslt.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslti_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.b $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> %va, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslti_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslti_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.w $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> %va, i32 -16) ++ ret <4 x i32> %res ++} ++ 
++declare <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslti_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.d $vr0, $vr0, -16 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> %va, i32 -16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vslt_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslt.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vslt_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslt.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vslt_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslt.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vslt_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vslt_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslt.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslt.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8>, i32) ++ ++define <16 x i8> 
@lsx_vslti_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.bu $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslti_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.hu $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> %va, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslti_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslti_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vslti_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vslti.du $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra.ll +new file mode 100644 +index 000000000000..e85c8464c18e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsra_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsra_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsra.b $vr0, 
$vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsra.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsra_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsra_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsra.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsra.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsra_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsra_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsra.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsra.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsra_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsra_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsra.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsra.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrai_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsrai_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrai.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrai_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsrai_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrai.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> 
%res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrai_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsrai_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrai.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrai_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsrai_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrai.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sran.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sran.ll +new file mode 100644 +index 000000000000..4ffe5a704c2c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sran.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vsran_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsran_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsran.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsran.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vsran_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsran_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsran.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsran.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64>, <2 x 
i64>) ++ ++define <4 x i32> @lsx_vsran_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsran_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsran.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsran.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani.ll +new file mode 100644 +index 000000000000..717c641616c8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrani_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrani_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrani.b.h $vr0, $vr1, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrani_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrani_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrani.h.w $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrani_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrani_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrani.w.d $vr0, $vr1, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 63) ++ ret <4 x i32> %res ++} ++ ++declare <2 x 
i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrani_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrani_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrani.d.q $vr0, $vr1, 127 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 127) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar.ll +new file mode 100644 +index 000000000000..8b52b7ac9631 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsrar_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrar_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrar.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrar.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsrar_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrar_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrar.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrar.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsrar_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrar_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrar.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrar.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vsrar.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsrar_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrar_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrar.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrar.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrari_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsrari_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrari.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrari_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsrari_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrari.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrari_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsrari_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrari.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrari_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsrari_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrari.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarn.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarn.ll +new file mode 100644 +index 000000000000..d4cdfb5359ea +--- /dev/null 
++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarn.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vsrarn_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrarn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarn.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrarn.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vsrarn_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrarn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarn.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrarn.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vsrarn_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrarn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarn.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrarn.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni.ll +new file mode 100644 +index 000000000000..2253e88372fc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrarni_b_h(<16 x i8> %va, <16 x i8> %vb) 
nounwind { ++; CHECK-LABEL: lsx_vsrarni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarni.b.h $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrarni_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrarni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarni.h.w $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrarni_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrarni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarni.w.d $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrarni_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrarni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrarni.d.q $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 1) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl.ll +new file mode 100644 +index 000000000000..1cddd9622233 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8>, <16 x i8>) ++ 
++define <16 x i8> @lsx_vsrl_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrl_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrl.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrl.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsrl_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrl_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrl.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrl.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsrl_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrl_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrl.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrl.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsrl_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrl_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrl.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrl.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrli_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsrli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrli.b $vr0, $vr0, 7 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> %va, i32 7) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrli_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsrli_h: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: vsrli.h $vr0, $vr0, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> %va, i32 15) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrli_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsrli_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrli.w $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrli_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsrli_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrli.d $vr0, $vr0, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> %va, i32 63) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srln.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srln.ll +new file mode 100644 +index 000000000000..1c9b23243ffb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srln.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vsrln_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrln_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrln.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrln.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrln.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vsrln_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrln_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrln.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> 
@llvm.loongarch.lsx.vsrln.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vsrln_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrln_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrln.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrln.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni.ll +new file mode 100644 +index 000000000000..6e523efa1824 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlni_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlni.b.h $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlni_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlni.h.w $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlni_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlni.w.d $vr0, $vr1, 1 ++; CHECK-NEXT: 
ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlni_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlni.d.q $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 1) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr.ll +new file mode 100644 +index 000000000000..51638fa1a47f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsrlr_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlr.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlr.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsrlr_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlr.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlr.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsrlr_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlr.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlr.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsrlr_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlr.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlr.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlri_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsrlri_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlri.b $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> %va, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlri_h(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsrlri_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlri.h $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> %va, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlri_w(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsrlri_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlri.w $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> %va, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlri_d(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsrlri_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlri.d $vr0, $vr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> %va, i32 1) ++ ret <2 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrn.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrn.ll +new file mode 100644 +index 000000000000..893e51396241 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrn.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vsrlrn_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrn.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlrn.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vsrlrn_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrn.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlrn.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vsrlrn_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrn.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlrn.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni.ll +new file mode 100644 +index 000000000000..d1ea450d2237 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 
--mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlrni_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrni.b.h $vr0, $vr1, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlrni_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrni.h.w $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlrni_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrni.w.d $vr0, $vr1, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 63) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlrni_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsrlrni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsrlrni.d.q $vr0, $vr1, 127 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 127) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssran.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssran.ll +new file mode 100644 +index 000000000000..cecccbb730c9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssran.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssran_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssran_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssran.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssran.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssran_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssran_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssran.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssran.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssran_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssran_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssran.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssran.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssran_bu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssran_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssran.bu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssran.bu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssran_hu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssran_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssran.hu.w $vr0, $vr0, $vr1 
++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssran.hu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssran_wu_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssran_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssran.wu.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssran.wu.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani.ll +new file mode 100644 +index 000000000000..57b8eb169866 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrani_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.b.h $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrani_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.h.w $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrani_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_w_d: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.w.d $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrani_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.d.q $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 1) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrani_bu_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.bu.h $vr0, $vr1, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrani_hu_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.hu.w $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrani_wu_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.wu.d $vr0, $vr1, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 63) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 
x i64> @lsx_vssrani_du_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrani_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrani.du.q $vr0, $vr1, 127 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> %va, <2 x i64> %vb, i32 127) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarn.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarn.ll +new file mode 100644 +index 000000000000..c6b7d9ec8e1d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarn.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssrarn_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarn.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarn.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssrarn_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarn.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarn.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssrarn_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarn.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarn.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <16 x i8> 
@llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssrarn_bu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarn_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarn.bu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarn.bu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssrarn_hu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarn_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarn.hu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarn.hu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssrarn_wu_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarn_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarn.wu.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarn.wu.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni.ll +new file mode 100644 +index 000000000000..1a2e91962ac3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrarni_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.b.h $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> 
%va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrarni_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.h.w $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrarni_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.w.d $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrarni_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.d.q $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 1) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrarni_bu_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.bu.h $vr0, $vr1, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrarni_hu_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vssrarni.hu.w $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrarni_wu_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.wu.d $vr0, $vr1, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 63) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrarni_du_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrarni_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrarni.du.q $vr0, $vr1, 127 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 127) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrln.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrln.ll +new file mode 100644 +index 000000000000..697ccc3962a8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrln.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssrln_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrln_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrln.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrln.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssrln_h_w(<4 x i32> %va, <4 x i32> %vb) 
nounwind { ++; CHECK-LABEL: lsx_vssrln_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrln.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrln.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssrln_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrln_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrln.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrln.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssrln_bu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrln_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrln.bu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrln.bu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssrln_hu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrln_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrln.hu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrln.hu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssrln_wu_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrln_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrln.wu.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrln.wu.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni.ll +new file mode 100644 +index 000000000000..8dd41e7abe87 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlni_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.b.h $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlni_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.h.w $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlni_w_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.w.d $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlni_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.d.q $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 1) ++ ret <2 x 
i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlni_bu_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.bu.h $vr0, $vr1, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlni_hu_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.hu.w $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlni_wu_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.wu.d $vr0, $vr1, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 63) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlni_du_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlni_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlni.du.q $vr0, $vr1, 127 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 127) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrn.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrn.ll +new file mode 100644 +index 000000000000..a8e76cbaa7fd +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrn.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssrlrn_b_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrn.b.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrn.b.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssrlrn_h_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrn.h.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrn.h.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssrlrn_w_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrn.w.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrn.w.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16>, <8 x i16>) ++ ++define <16 x i8> @lsx_vssrlrn_bu_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrn_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrn.bu.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrn.bu.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32>, <4 x i32>) ++ ++define <8 x i16> @lsx_vssrlrn_hu_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrn_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
vssrlrn.hu.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrn.hu.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64>, <2 x i64>) ++ ++define <4 x i32> @lsx_vssrlrn_wu_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrn_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrn.wu.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrn.wu.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni.ll +new file mode 100644 +index 000000000000..869e81b2b09d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlrni_b_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.b.h $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 1) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlrni_h_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.h.w $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 1) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlrni_w_d(<4 x i32> %va, <4 x i32> %vb) 
nounwind { ++; CHECK-LABEL: lsx_vssrlrni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.w.d $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 1) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlrni_d_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.d.q $vr0, $vr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 1) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlrni_bu_h(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.bu.h $vr0, $vr1, 15 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 15) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlrni_hu_w(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.hu.w $vr0, $vr1, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlrni_wu_d(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.wu.d $vr0, $vr1, 63 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 63) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlrni_du_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssrlrni_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssrlrni.du.q $vr0, $vr1, 127 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 127) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssub.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssub.ll +new file mode 100644 +index 000000000000..c594b426d650 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssub.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vssub_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssub.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vssub_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssub.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vssub_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssub.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x 
i64> @llvm.loongarch.lsx.vssub.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vssub_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssub.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vssub_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssub.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vssub_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssub.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vssub_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssub.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vssub_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vssub_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vssub.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssub.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st.ll +new file mode 100644 +index 000000000000..798f509f2318 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare void @llvm.loongarch.lsx.vst(<16 x i8>, i8*, i32) ++ ++define void @lsx_vst(<16 x i8> %va, i8* %p) nounwind { ++; CHECK-LABEL: lsx_vst: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vst $vr0, $a0, -2048 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lsx.vst(<16 x i8> %va, i8* %p, i32 -2048) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstx(<16 x i8>, i8*, i64) ++ ++define void @lsx_vstx(<16 x i8> %va, i8* %p, i64 %c) nounwind { ++; CHECK-LABEL: lsx_vstx: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vstx $vr0, $a0, $a1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lsx.vstx(<16 x i8> %va, i8* %p, i64 %c) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm.ll +new file mode 100644 +index 000000000000..6b9e7a9d7462 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare void @llvm.loongarch.lsx.vstelm.b(<16 x i8>, i8*, i32, i32) ++ ++define void @lsx_vstelm_b(<16 x i8> %va, i8* %p) nounwind { ++; CHECK-LABEL: lsx_vstelm_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vstelm.b $vr0, $a0, 1, 15 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 1, i32 15) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.h(<8 x i16>, i8*, i32, i32) ++ ++define void @lsx_vstelm_h(<8 x i16> %va, i8* %p) nounwind { ++; CHECK-LABEL: lsx_vstelm_h: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: vstelm.h $vr0, $a0, 2, 7 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 2, i32 7) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.w(<4 x i32>, i8*, i32, i32) ++ ++define void @lsx_vstelm_w(<4 x i32> %va, i8* %p) nounwind { ++; CHECK-LABEL: lsx_vstelm_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vstelm.w $vr0, $a0, 4, 3 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 4, i32 3) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.d(<2 x i64>, i8*, i32, i32) ++ ++define void @lsx_vstelm_d(<2 x i64> %va, i8* %p) nounwind { ++; CHECK-LABEL: lsx_vstelm_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vstelm.d $vr0, $a0, 8, 1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 8, i32 1) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sub.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sub.ll +new file mode 100644 +index 000000000000..5c04a3d8de0d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sub.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vsub_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsub_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsub.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsub.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsub.h(<8 x i16>, <8 x i16>) ++ ++define <8 x i16> @lsx_vsub_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsub_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsub.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> 
@llvm.loongarch.lsx.vsub.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32>, <4 x i32>) ++ ++define <4 x i32> @lsx_vsub_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsub_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsub.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsub.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsub_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsub.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsub.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsub_q(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsub_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsub.q $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsub.q(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi.ll +new file mode 100644 +index 000000000000..304a4e4a78cc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsubi_bu(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vsubi_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubi.bu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> %va, i32 31) ++ ret <16 x i8> %res 
++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsubi_hu(<8 x i16> %va) nounwind { ++; CHECK-LABEL: lsx_vsubi_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubi.hu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> %va, i32 31) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsubi_wu(<4 x i32> %va) nounwind { ++; CHECK-LABEL: lsx_vsubi_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubi.wu $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> %va, i32 31) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsubi_du(<2 x i64> %va) nounwind { ++; CHECK-LABEL: lsx_vsubi_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubi.du $vr0, $vr0, 31 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> %va, i32 31) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subw.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subw.ll +new file mode 100644 +index 000000000000..48100db74334 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subw.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vsubwev_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 
x i32> @lsx_vsubwev_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vsubwev_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.w(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsubwev_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vsubwev_h_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.h.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubwev.h.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vsubwev_w_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.w.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubwev.w.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vsubwev_d_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.d.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwev.d.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsubwev_q_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwev.q.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwev.q.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vsubwod_h_b(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.h.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.b(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vsubwod_w_h(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.w.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.h(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vsubwod_d_w(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.d.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.w(<4 x i32> %va, <4 x i32> %vb) ++ 
ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsubwod_q_d(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.q.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwod.q.d(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8>, <16 x i8>) ++ ++define <8 x i16> @lsx_vsubwod_h_bu(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.h.bu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubwod.h.bu(<16 x i8> %va, <16 x i8> %vb) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16>, <8 x i16>) ++ ++define <4 x i32> @lsx_vsubwod_w_hu(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.w.hu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubwod.w.hu(<8 x i16> %va, <8 x i16> %vb) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32>, <4 x i32>) ++ ++define <2 x i64> @lsx_vsubwod_d_wu(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.d.wu $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubwod.d.wu(<4 x i32> %va, <4 x i32> %vb) ++ ret <2 x i64> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubwod.q.du(<2 x i64>, <2 x i64>) ++ ++define <2 x i64> @lsx_vsubwod_q_du(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK-LABEL: lsx_vsubwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vsubwod.q.du $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <2 x i64> 
@llvm.loongarch.lsx.vsubwod.q.du(<2 x i64> %va, <2 x i64> %vb) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xor.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xor.ll +new file mode 100644 +index 000000000000..72a1fe93c2c0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xor.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8>, <16 x i8>) ++ ++define <16 x i8> @lsx_vxor_v(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK-LABEL: lsx_vxor_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vxor.v(<16 x i8> %va, <16 x i8> %vb) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori.ll +new file mode 100644 +index 000000000000..09669cd5ac14 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vxori_b(<16 x i8> %va) nounwind { ++; CHECK-LABEL: lsx_vxori_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vxori.b $vr0, $vr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> %va, i32 3) ++ ret <16 x i8> %res ++} +-- +2.20.1 + + +From fd469d4a3c3b439f40accda691597502bc444a99 Mon Sep 17 00:00:00 2001 +From: chenli +Date: Sat, 19 Aug 2023 17:12:27 +0800 +Subject: [PATCH 05/35] [LoongArch] Add LASX intrinsic testcases + +Depends on D155830 + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D155835 + +(cherry picked from commit 
83311b2b5d1b9869f9a7b265994394ea898448a2) +--- + .../CodeGen/LoongArch/lasx/intrinsic-absd.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-add.ll | 62 ++ + .../CodeGen/LoongArch/lasx/intrinsic-adda.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-addi.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-addw.ll | 290 ++++++++++ + .../CodeGen/LoongArch/lasx/intrinsic-and.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-andi.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-andn.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-avg.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-avgr.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-bitclr.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-bitrev.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-bitsel.ll | 14 + + .../LoongArch/lasx/intrinsic-bitseli.ll | 14 + + .../LoongArch/lasx/intrinsic-bitset.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-bsll.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-bsrl.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-clo.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-clz.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-div.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-ext2xv.ll | 146 +++++ + .../CodeGen/LoongArch/lasx/intrinsic-exth.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-extl.ll | 26 + + .../LoongArch/lasx/intrinsic-extrins.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-fadd.ll | 26 + + .../LoongArch/lasx/intrinsic-fclass.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fcmp.ll | 530 ++++++++++++++++++ + .../CodeGen/LoongArch/lasx/intrinsic-fcvt.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fcvth.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fcvtl.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fdiv.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-ffint.ll | 86 +++ + .../CodeGen/LoongArch/lasx/intrinsic-flogb.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fmadd.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fmax.ll | 26 + + 
.../CodeGen/LoongArch/lasx/intrinsic-fmaxa.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fmin.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fmina.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fmsub.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fmul.ll | 26 + + .../LoongArch/lasx/intrinsic-fnmadd.ll | 26 + + .../LoongArch/lasx/intrinsic-fnmsub.ll | 26 + + .../LoongArch/lasx/intrinsic-frecip.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-frint.ll | 122 ++++ + .../LoongArch/lasx/intrinsic-frsqrt.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-frstp.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-fsqrt.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-fsub.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-ftint.ll | 350 ++++++++++++ + .../CodeGen/LoongArch/lasx/intrinsic-haddw.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-hsubw.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-ilv.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-insgr2vr.ll | 28 + + .../LoongArch/lasx/intrinsic-insve0.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-ld.ll | 26 + + .../CodeGen/LoongArch/lasx/intrinsic-ldi.ll | 62 ++ + .../LoongArch/lasx/intrinsic-ldrepl.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-madd.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-maddw.ll | 290 ++++++++++ + .../CodeGen/LoongArch/lasx/intrinsic-max.ll | 194 +++++++ + .../CodeGen/LoongArch/lasx/intrinsic-min.ll | 194 +++++++ + .../CodeGen/LoongArch/lasx/intrinsic-mod.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-mskgez.ll | 14 + + .../LoongArch/lasx/intrinsic-mskltz.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-msknz.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-msub.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-muh.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-mul.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-mulw.ll | 290 ++++++++++ + .../CodeGen/LoongArch/lasx/intrinsic-neg.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-nor.ll | 14 + + 
.../CodeGen/LoongArch/lasx/intrinsic-nori.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-or.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-ori.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-orn.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-pack.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-pcnt.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-perm.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-permi.ll | 38 ++ + .../CodeGen/LoongArch/lasx/intrinsic-pick.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-pickve.ll | 50 ++ + .../LoongArch/lasx/intrinsic-pickve2gr.ll | 53 ++ + .../LoongArch/lasx/intrinsic-repl128vei.ll | 50 ++ + .../LoongArch/lasx/intrinsic-replgr2vr.ll | 50 ++ + .../LoongArch/lasx/intrinsic-replve.ll | 50 ++ + .../LoongArch/lasx/intrinsic-replve0.ll | 62 ++ + .../CodeGen/LoongArch/lasx/intrinsic-rotr.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-sadd.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-sat.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-seq.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-set.ll | 38 ++ + .../LoongArch/lasx/intrinsic-setallnez.ll | 74 +++ + .../LoongArch/lasx/intrinsic-setanyeqz.ll | 74 +++ + .../CodeGen/LoongArch/lasx/intrinsic-shuf.ll | 50 ++ + .../LoongArch/lasx/intrinsic-shuf4i.ll | 50 ++ + .../LoongArch/lasx/intrinsic-signcov.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-sle.ll | 194 +++++++ + .../CodeGen/LoongArch/lasx/intrinsic-sll.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-sllwil.ll | 74 +++ + .../CodeGen/LoongArch/lasx/intrinsic-slt.ll | 194 +++++++ + .../CodeGen/LoongArch/lasx/intrinsic-sra.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-sran.ll | 38 ++ + .../CodeGen/LoongArch/lasx/intrinsic-srani.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-srar.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-srarn.ll | 38 ++ + .../LoongArch/lasx/intrinsic-srarni.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-srl.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-srln.ll | 38 ++ 
+ .../CodeGen/LoongArch/lasx/intrinsic-srlni.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-srlr.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-srlrn.ll | 38 ++ + .../LoongArch/lasx/intrinsic-srlrni.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-ssran.ll | 74 +++ + .../LoongArch/lasx/intrinsic-ssrani.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-ssrarn.ll | 74 +++ + .../LoongArch/lasx/intrinsic-ssrarni.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-ssrln.ll | 74 +++ + .../LoongArch/lasx/intrinsic-ssrlni.ll | 98 ++++ + .../LoongArch/lasx/intrinsic-ssrlrn.ll | 74 +++ + .../LoongArch/lasx/intrinsic-ssrlrni.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-ssub.ll | 98 ++++ + .../CodeGen/LoongArch/lasx/intrinsic-st.ll | 27 + + .../CodeGen/LoongArch/lasx/intrinsic-stelm.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-sub.ll | 62 ++ + .../CodeGen/LoongArch/lasx/intrinsic-subi.ll | 50 ++ + .../CodeGen/LoongArch/lasx/intrinsic-subw.ll | 194 +++++++ + .../CodeGen/LoongArch/lasx/intrinsic-xor.ll | 14 + + .../CodeGen/LoongArch/lasx/intrinsic-xori.ll | 14 + + 128 files changed, 9154 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-absd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-add.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-adda.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-addw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-and.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-andn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-avg.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-avgr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev.ll + create mode 
100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitsel.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-clo.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-clz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-div.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ext2xv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-exth.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-extl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fclass.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcmp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvth.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvtl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fdiv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ffint.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-flogb.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmax.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmaxa.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmin.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmina.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmsub.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecip.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frint.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsqrt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ftint.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-haddw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-hsubw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ilv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-madd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-maddw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-max.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-min.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-mod.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskgez.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskltz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-msknz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-msub.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-muh.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-mul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-mulw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-neg.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-nor.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-or.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-orn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pack.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pcnt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-perm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pick.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve0.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-signcov.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sran.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srln.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssran.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrln.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrn.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-st.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-subw.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-xor.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori.ll + +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-absd.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-absd.ll +new file mode 100644 +index 000000000000..bf54f44357b0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-absd.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvabsd_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvabsd.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvabsd.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvabsd_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvabsd.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvabsd.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvabsd_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
xvabsd.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvabsd.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvabsd_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvabsd.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvabsd.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvabsd_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvabsd.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvabsd.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvabsd_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvabsd.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvabsd.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvabsd_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvabsd.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvabsd.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvabsd_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvabsd_du: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: xvabsd.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvabsd.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-add.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-add.ll +new file mode 100644 +index 000000000000..0c2f2ace29fc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-add.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvadd_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvadd.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvadd_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvadd.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvadd_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvadd.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvadd_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadd_d: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvadd.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvadd_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadd_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadd.q $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvadd.q(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-adda.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-adda.ll +new file mode 100644 +index 000000000000..c1258d53e913 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-adda.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvadda_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadda_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadda.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvadda.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvadda_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadda_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadda.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvadda.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvadda_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadda_w: ++; CHECK: # %bb.0: # %entry 
++; CHECK-NEXT: xvadda.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvadda.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvadda_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvadda_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvadda.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvadda.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi.ll +new file mode 100644 +index 000000000000..09b5d07a0151 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvaddi_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvaddi_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvaddi_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvaddi_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvaddi_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvaddi_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvaddi_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvaddi_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddi.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addw.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addw.ll +new file mode 100644 +index 000000000000..ef7a1b5a50ef +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addw.ll +@@ -0,0 +1,290 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvaddwev_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvaddwev_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvaddwev_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.d.w $xr0, 
$xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvaddwev_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvaddwev_h_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.h.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvaddwev_w_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.w.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvaddwev_d_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.d.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvaddwev_q_du(<4 x i64> %va, <4 x i64> %vb) 
nounwind { ++; CHECK-LABEL: lasx_xvaddwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.q.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvaddwev_h_bu_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.h.bu.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddwev.h.bu.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvaddwev_w_hu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.w.hu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddwev.w.hu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvaddwev_d_wu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.d.wu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwev.d.wu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvaddwev_q_du_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwev_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwev.q.du.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwev.q.du.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res 
++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvaddwod_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvaddwod_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvaddwod_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvaddwod_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvaddwod_h_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.h.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call 
<16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvaddwod_w_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.w.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvaddwod_d_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.d.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvaddwod_q_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.q.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvaddwod_h_bu_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.h.bu.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddwod.h.bu.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvaddwod_w_hu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: 
lasx_xvaddwod_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.w.hu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddwod.w.hu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvaddwod_d_wu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.d.wu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwod.d.wu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvaddwod_q_du_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvaddwod_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvaddwod.q.du.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddwod.q.du.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-and.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-and.ll +new file mode 100644 +index 000000000000..15f3a8094770 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-and.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvand_v(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvand_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvand.v(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi.ll +new file mode 100644 +index 000000000000..88cf142d6968 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvandi_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvandi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvandi.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andn.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andn.ll +new file mode 100644 +index 000000000000..f385ef3661cb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andn.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvandn_v(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvandn_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvandn.v(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-avg.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-avg.ll +new file mode 100644 +index 000000000000..488d3b96b003 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-avg.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8>, 
<32 x i8>) ++ ++define <32 x i8> @lasx_xvavg_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvavg.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvavg_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvavg.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvavg_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvavg.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvavg_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvavg.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvavg_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvavg.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16>, <16 x i16>) ++ ++define 
<16 x i16> @lasx_xvavg_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvavg.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvavg_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvavg.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvavg_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavg_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavg.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvavg.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-avgr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-avgr.ll +new file mode 100644 +index 000000000000..b5ab5a5366aa +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-avgr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvavgr_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvavgr.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16>, <16 x i16>) ++ 
++define <16 x i16> @lasx_xvavgr_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvavgr.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvavgr_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvavgr.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvavgr_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvavgr.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvavgr_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvavgr.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvavgr_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvavgr.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32>, <8 x 
i32>) ++ ++define <8 x i32> @lasx_xvavgr_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvavgr.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvavgr_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvavgr_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvavgr.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvavgr.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll +new file mode 100644 +index 000000000000..cec71bab2fe8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvbitclr_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitclr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclr.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitclr.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvbitclr_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitclr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclr.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitclr.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> 
@llvm.loongarch.lasx.xvbitclr.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvbitclr_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitclr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclr.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitclr.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvbitclr_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitclr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclr.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitclr.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitclri_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitclri_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclri.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitclri_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitclri_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclri.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitclri_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitclri_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclri.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64>, 
i32) ++ ++define <4 x i64> @lasx_xvbitclri_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitclri_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitclri.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev.ll +new file mode 100644 +index 000000000000..fb4f9fbc2e4b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvbitrev_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitrev_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrev.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitrev.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvbitrev_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitrev_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrev.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitrev.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvbitrev_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitrev_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrev.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitrev.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> 
@llvm.loongarch.lasx.xvbitrev.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvbitrev_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitrev_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrev.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitrev.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitrevi_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitrevi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrevi.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitrevi_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitrevi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrevi.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitrevi_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitrevi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrevi.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitrevi_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitrevi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitrevi.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitsel.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitsel.ll +new file mode 100644 +index 000000000000..2e91407590ac +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitsel.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8>, <32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvbitsel_v(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvbitsel_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitsel.v $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitsel.v(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli.ll +new file mode 100644 +index 000000000000..79dd55cbfef9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitseli_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitseli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitseli.b $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset.ll +new file mode 100644 +index 000000000000..83d1f0ef60c6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvbitset_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitset_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitset.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitset.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvbitset_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitset_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitset.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitset.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvbitset_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitset_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitset.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitset.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvbitset_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvbitset_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitset.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitset.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitseti_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitseti_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitseti.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x 
i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitseti_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitseti_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitseti.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitseti_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitseti_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitseti.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitseti_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvbitseti_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbitseti.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll.ll +new file mode 100644 +index 000000000000..cbb63ced5cc0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbsll_v(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvbsll_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbsll.v $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> %va, i32 1) ++ ret <32 x i8> 
%res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl.ll +new file mode 100644 +index 000000000000..b0c26cbe3e35 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbsrl_v(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvbsrl_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvbsrl.v $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-clo.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-clo.ll +new file mode 100644 +index 000000000000..29b2be03d54e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-clo.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvclo_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvclo_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclo.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvclo.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16>) ++ ++define <16 x i16> @lasx_xvclo_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvclo_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclo.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvclo.h(<16 x i16> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32>) ++ ++define <8 x i32> @lasx_xvclo_w(<8 x 
i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvclo_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclo.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvclo.w(<8 x i32> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvclo_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvclo_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclo.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvclo.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-clz.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-clz.ll +new file mode 100644 +index 000000000000..5247ceedbd14 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-clz.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvclz_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvclz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclz.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvclz.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16>) ++ ++define <16 x i16> @lasx_xvclz_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvclz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclz.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvclz.h(<16 x i16> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x i32>) ++ ++define <8 x i32> @lasx_xvclz_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvclz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclz.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvclz.w(<8 x 
i32> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvclz_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvclz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvclz.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvclz.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-div.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-div.ll +new file mode 100644 +index 000000000000..813204092e94 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-div.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvdiv_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvdiv.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvdiv_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvdiv.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvdiv_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvdiv.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> 
@llvm.loongarch.lasx.xvdiv.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvdiv_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvdiv.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvdiv_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvdiv.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvdiv_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvdiv.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvdiv_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvdiv.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvdiv_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvdiv_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvdiv.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvdiv.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ext2xv.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ext2xv.ll +new file mode 100644 +index 000000000000..48721b52af00 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ext2xv.ll +@@ -0,0 +1,146 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8>) ++ ++define <16 x i16> @lasx_vext2xv_h_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.h.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.vext2xv.h.b(<32 x i8> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8>) ++ ++define <8 x i32> @lasx_vext2xv_w_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_w_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.w.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.b(<32 x i8> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8>) ++ ++define <4 x i64> @lasx_vext2xv_d_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_d_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.d.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.b(<32 x i8> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16>) ++ ++define <8 x i32> @lasx_vext2xv_w_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.w.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.vext2xv.w.h(<16 x i16> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16>) ++ ++define <4 x i64> @lasx_vext2xv_d_h(<16 x i16> %va) 
nounwind { ++; CHECK-LABEL: lasx_vext2xv_d_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.d.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.h(<16 x i16> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32>) ++ ++define <4 x i64> @lasx_vext2xv_d_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.d.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.vext2xv.d.w(<8 x i32> %va) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8>) ++ ++define <16 x i16> @lasx_vext2xv_hu_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.hu.bu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.vext2xv.hu.bu(<32 x i8> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8>) ++ ++define <8 x i32> @lasx_vext2xv_wu_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_wu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.wu.bu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.bu(<32 x i8> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8>) ++ ++define <4 x i64> @lasx_vext2xv_du_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_du_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.du.bu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.bu(<32 x i8> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16>) ++ ++define <8 x i32> @lasx_vext2xv_wu_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.wu.hu $xr0, $xr0 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.vext2xv.wu.hu(<16 x i16> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16>) ++ ++define <4 x i64> @lasx_vext2xv_du_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_du_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.du.hu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.hu(<16 x i16> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32>) ++ ++define <4 x i64> @lasx_vext2xv_du_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_vext2xv_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vext2xv.du.wu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.vext2xv.du.wu(<8 x i32> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-exth.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-exth.ll +new file mode 100644 +index 000000000000..543589e61b12 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-exth.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8>) ++ ++define <16 x i16> @lasx_xvexth_h_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.h.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvexth.h.b(<32 x i8> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16>) ++ ++define <8 x i32> @lasx_xvexth_w_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.w.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvexth.w.h(<16 x i16> %va) ++ ret <8 x i32> %res 
++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32>) ++ ++define <4 x i64> @lasx_xvexth_d_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.d.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvexth.d.w(<8 x i32> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvexth_q_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.q.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvexth.q.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8>) ++ ++define <16 x i16> @lasx_xvexth_hu_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.hu.bu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvexth.hu.bu(<32 x i8> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16>) ++ ++define <8 x i32> @lasx_xvexth_wu_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.wu.hu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvexth.wu.hu(<16 x i16> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32>) ++ ++define <4 x i64> @lasx_xvexth_du_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvexth_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.du.wu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvexth.du.wu(<8 x i32> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64>) ++ ++define <4 x i64> @lasx_xvexth_qu_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: 
lasx_xvexth_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvexth.qu.du $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvexth.qu.du(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extl.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extl.ll +new file mode 100644 +index 000000000000..7040c8c784cd +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extl.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvextl_q_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvextl_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvextl.q.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvextl.q.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64>) ++ ++define <4 x i64> @lasx_xvextl_qu_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvextl_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvextl.qu.du $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvextl.qu.du(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins.ll +new file mode 100644 +index 000000000000..c8774a7b29c0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvextrins_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvextrins_b: ++; CHECK: # 
%bb.0: # %entry ++; CHECK-NEXT: xvextrins.b $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvextrins_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvextrins_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvextrins.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvextrins_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvextrins_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvextrins.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvextrins_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvextrins_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvextrins.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fadd.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fadd.ll +new file mode 100644 +index 000000000000..563a0ce9e384 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fadd.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float>, <8 x float>) ++ ++define <8 x float> 
@lasx_xvfadd_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfadd_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfadd.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfadd.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfadd_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfadd.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfadd.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fclass.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fclass.ll +new file mode 100644 +index 000000000000..901ca5bb0260 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fclass.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvfclass_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfclass_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfclass.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfclass.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvfclass_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfclass_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfclass.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfclass.d(<4 x double> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcmp.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcmp.ll +new file mode 100644 +index 000000000000..b01f908e71af +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcmp.ll +@@ -0,0 +1,530 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_caf_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_caf_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.caf.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.caf.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_caf_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_caf_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.caf.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.caf.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cun_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cun_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cun.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cun.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cun_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cun_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cun.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> 
@llvm.loongarch.lasx.xvfcmp.cun.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_ceq_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_ceq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.ceq.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.ceq.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_ceq_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_ceq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.ceq.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.ceq.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cueq_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cueq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cueq.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cueq.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cueq_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cueq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cueq.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cueq.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_clt_s(<8 x float> %va, <8 x float> %vb) nounwind { 
++; CHECK-LABEL: lasx_xvfcmp_clt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.clt.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_clt_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_clt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.clt.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cult_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cult_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cult.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cult.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cult_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cult_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cult.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cult.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cle_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cle_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cle.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cle.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ 
++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cle_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cle_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cle.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cle.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cule_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cule_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cule.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cule.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cule_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cule_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cule.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cule.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cne_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cne_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cne.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cne.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cne_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cne_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cne.d 
$xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cne.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cor_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cor_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cor.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cor.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cor_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cor_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cor.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cor.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_cune_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cune_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cune.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.cune.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_cune_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_cune_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.cune.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.cune.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float>, <8 x float>) ++ ++define <8 
x i32> @lasx_xvfcmp_saf_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_saf_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.saf.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.saf.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_saf_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_saf_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.saf.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.saf.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sun_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sun_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sun.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sun.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sun_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sun_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sun.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sun.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_seq_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_seq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.seq.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.seq.s(<8 x 
float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_seq_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_seq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.seq.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.seq.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sueq_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sueq_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sueq.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sueq.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sueq_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sueq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sueq.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sueq.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_slt_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_slt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.slt.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.slt.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_slt_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_slt_d: 
++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.slt.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.slt.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sult_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sult_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sult.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sult.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sult_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sult_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sult.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sult.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sle_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sle_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sle.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sle.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sle_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sle_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sle.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sle.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> 
@llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sule_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sule_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sule.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sule.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sule_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sule_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sule.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sule.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sne_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sne_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sne.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sne.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sne_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sne_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sne.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sne.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sor_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sor_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sor.s $xr0, $xr0, $xr1 ++; 
CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sor.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sor_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sor_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sor.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sor.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float>, <8 x float>) ++ ++define <8 x i32> @lasx_xvfcmp_sune_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sune_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sune.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvfcmp.sune.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double>, <4 x double>) ++ ++define <4 x i64> @lasx_xvfcmp_sune_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcmp_sune_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcmp.sune.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvfcmp.sune.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvt.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvt.ll +new file mode 100644 +index 000000000000..82bf1d3df72c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvt.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float>, <8 x float>) ++ ++define <16 x i16> @lasx_xvfcvt_h_s(<8 x float> 
%va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcvt_h_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcvt.h.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvfcvt.h.s(<8 x float> %va, <8 x float> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double>, <4 x double>) ++ ++define <8 x float> @lasx_xvfcvt_s_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfcvt_s_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcvt.s.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfcvt.s.d(<4 x double> %va, <4 x double> %vb) ++ ret <8 x float> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvth.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvth.ll +new file mode 100644 +index 000000000000..e1a6a2923e67 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvth.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16>) ++ ++define <8 x float> @lasx_xvfcvth_s_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvfcvth_s_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcvth.s.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfcvth.s.h(<16 x i16> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float>) ++ ++define <4 x double> @lasx_xvfcvth_d_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfcvth_d_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcvth.d.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfcvth.d.s(<8 x float> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvtl.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvtl.ll +new file mode 100644 +index 000000000000..0b3e693c7f51 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fcvtl.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16>) ++ ++define <8 x float> @lasx_xvfcvtl_s_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvfcvtl_s_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcvtl.s.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfcvtl.s.h(<16 x i16> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float>) ++ ++define <4 x double> @lasx_xvfcvtl_d_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfcvtl_d_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfcvtl.d.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfcvtl.d.s(<8 x float> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fdiv.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fdiv.ll +new file mode 100644 +index 000000000000..49923ddd4e8d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fdiv.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfdiv_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfdiv_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfdiv.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfdiv.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x 
double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfdiv_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfdiv_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfdiv.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfdiv.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ffint.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ffint.ll +new file mode 100644 +index 000000000000..24da0bd33838 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ffint.ll +@@ -0,0 +1,86 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32>) ++ ++define <8 x float> @lasx_xvffint_s_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvffint_s_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffint.s.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvffint.s.w(<8 x i32> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64>) ++ ++define <4 x double> @lasx_xvffint_d_l(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvffint_d_l: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffint.d.l $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvffint.d.l(<4 x i64> %va) ++ ret <4 x double> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32>) ++ ++define <8 x float> @lasx_xvffint_s_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvffint_s_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffint.s.wu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvffint.s.wu(<8 x i32> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64>) ++ ++define <4 x 
double> @lasx_xvffint_d_lu(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvffint_d_lu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffint.d.lu $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvffint.d.lu(<4 x i64> %va) ++ ret <4 x double> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32>) ++ ++define <4 x double> @lasx_xvffintl_d_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvffintl_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffintl.d.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvffintl.d.w(<8 x i32> %va) ++ ret <4 x double> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32>) ++ ++define <4 x double> @lasx_xvffinth_d_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvffinth_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffinth.d.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvffinth.d.w(<8 x i32> %va) ++ ret <4 x double> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64>, <4 x i64>) ++ ++define <8 x float> @lasx_xvffint_s_l(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvffint_s_l: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvffint.s.l $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvffint.s.l(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x float> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-flogb.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-flogb.ll +new file mode 100644 +index 000000000000..bccef4504d70 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-flogb.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float>) ++ ++define <8 x float> 
@lasx_xvflogb_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvflogb_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvflogb.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvflogb.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double>) ++ ++define <4 x double> @lasx_xvflogb_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvflogb_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvflogb.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvflogb.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmadd.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmadd.ll +new file mode 100644 +index 000000000000..0fc06f971660 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmadd.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float>, <8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmadd_s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfmadd_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmadd.s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double>, <4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmadd_d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfmadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmadd.d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) 
++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmax.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmax.ll +new file mode 100644 +index 000000000000..2422fa0c00d8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmax.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmax_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmax_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmax.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmax.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmax_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmax_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmax.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmax.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmaxa.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmaxa.ll +new file mode 100644 +index 000000000000..cd9ccc656aef +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmaxa.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmaxa_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmaxa_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmaxa.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmaxa.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmaxa_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmaxa_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmaxa.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmaxa.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmin.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmin.ll +new file mode 100644 +index 000000000000..effb3f9e1d75 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmin.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmin_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmin_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmin.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmin.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmin_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmin_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmin.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmin.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmina.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmina.ll +new file mode 100644 +index 000000000000..753a6f31ba06 +--- 
/dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmina.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmina_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmina_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmina.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmina.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmina_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmina_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmina.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmina.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmsub.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmsub.ll +new file mode 100644 +index 000000000000..57909d0dd168 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmsub.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float>, <8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmsub_s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfmsub_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmsub.s $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmsub.s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) ++ ret <8 x float> %res ++} ++ ++declare <4 x 
double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double>, <4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmsub_d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfmsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmsub.d $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmsub.d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmul.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmul.ll +new file mode 100644 +index 000000000000..9cad6f383066 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fmul.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfmul_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmul_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmul.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfmul.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfmul_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfmul_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfmul.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfmul.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmadd.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmadd.ll +new file mode 100644 +index 000000000000..c30993590f98 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmadd.ll +@@ -0,0 +1,26 @@ ++; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float>, <8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfnmadd_s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfnmadd_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfnmadd.s $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfnmadd.s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double>, <4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfnmadd_d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfnmadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfnmadd.d $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfnmadd.d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmsub.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmsub.ll +new file mode 100644 +index 000000000000..2e7ca695be62 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fnmsub.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float>, <8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfnmsub_s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfnmsub_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfnmsub.s(<8 x float> %va, <8 x float> %vb, <8 x float> %vc) ++ ret <8 x 
float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double>, <4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfnmsub_d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfnmsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfnmsub.d(<4 x double> %va, <4 x double> %vb, <4 x double> %vc) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecip.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecip.ll +new file mode 100644 +index 000000000000..da3a26df2824 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecip.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrecip_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrecip_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrecip.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrecip.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrecip_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrecip_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrecip.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrecip.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frint.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frint.ll +new file mode 100644 +index 000000000000..ddead27cd14b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frint.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrintrne_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrne_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrne.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrintrne.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrintrne_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrne_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrne.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrintrne.d(<4 x double> %va) ++ ret <4 x double> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrintrz_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrz_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrz.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrintrz.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrintrz_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrz.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrintrz.d(<4 x double> %va) ++ ret <4 x double> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrintrp_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrp_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrp.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrintrp.s(<8 x float> %va) ++ ret <8 x 
float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrintrp_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrp_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrp.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrintrp.d(<4 x double> %va) ++ ret <4 x double> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrintrm_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrm_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrm.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrintrm.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrintrm_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrintrm_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrintrm.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrintrm.d(<4 x double> %va) ++ ret <4 x double> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrint_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrint_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrint.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrint.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrint_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrint_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrint.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrint.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrt.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrt.ll +new file mode 100644 +index 000000000000..6efa8122baf1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrt.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfrsqrt_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrsqrt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrsqrt.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfrsqrt.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfrsqrt_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfrsqrt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrsqrt.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfrsqrt.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp.ll +new file mode 100644 +index 000000000000..e83e55a52a11 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8>, <32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvfrstp_b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfrstp_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrstp.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvfrstp.b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> 
@llvm.loongarch.lasx.xvfrstp.h(<16 x i16>, <16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvfrstp_h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvfrstp_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrstp.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvfrstp.h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvfrstpi_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfrstpi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrstpi.b $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvfrstpi_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfrstpi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfrstpi.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsqrt.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsqrt.ll +new file mode 100644 +index 000000000000..a13333d8d81c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsqrt.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfsqrt.s(<8 x float>) ++ ++define <8 x float> @lasx_xvfsqrt_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvfsqrt_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfsqrt.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> 
@llvm.loongarch.lasx.xvfsqrt.s(<8 x float> %va) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double>) ++ ++define <4 x double> @lasx_xvfsqrt_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvfsqrt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfsqrt.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfsqrt.d(<4 x double> %va) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsub.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsub.ll +new file mode 100644 +index 000000000000..b52774a03618 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-fsub.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float>, <8 x float>) ++ ++define <8 x float> @lasx_xvfsub_s(<8 x float> %va, <8 x float> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfsub_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvfsub.s(<8 x float> %va, <8 x float> %vb) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double>, <4 x double>) ++ ++define <4 x double> @lasx_xvfsub_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvfsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvfsub.d(<4 x double> %va, <4 x double> %vb) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ftint.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ftint.ll +new file mode 100644 +index 000000000000..74cd507f16d2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ftint.ll +@@ -0,0 +1,350 @@ ++; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftintrne_w_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrne_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrne.w.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftintrne_l_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrne_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrne.l.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrne.l.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftintrz_w_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrz_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftintrz_l_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrz_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrz.l.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrz.l.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftintrp_w_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrp_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrp.w.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> 
@llvm.loongarch.lasx.xvftintrp.w.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftintrp_l_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrp_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrp.l.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrp.l.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftintrm_w_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrm_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrm.w.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftintrm_l_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrm_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrm.l.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrm.l.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftint_w_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftint_w_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftint.w.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftint.w.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftint_l_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftint_l_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftint.l.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftint.l.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> 
@llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftintrz_wu_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrz_wu_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrz.wu.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrz.wu.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftintrz_lu_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrz_lu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrz.lu.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrz.lu.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float>) ++ ++define <8 x i32> @lasx_xvftint_wu_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftint_wu_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftint.wu.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftint.wu.s(<8 x float> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double>) ++ ++define <4 x i64> @lasx_xvftint_lu_d(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvftint_lu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftint.lu.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftint.lu.d(<4 x double> %va) ++ ret <4 x i64> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double>, <4 x double>) ++ ++define <8 x i32> @lasx_xvftintrne_w_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvftintrne_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrne.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrne.w.d(<4 x double> %va, <4 x double> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <8 x i32> 
@llvm.loongarch.lasx.xvftintrz.w.d(<4 x double>, <4 x double>) ++ ++define <8 x i32> @lasx_xvftintrz_w_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvftintrz_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrz.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrz.w.d(<4 x double> %va, <4 x double> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double>, <4 x double>) ++ ++define <8 x i32> @lasx_xvftintrp_w_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvftintrp_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrp.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrp.w.d(<4 x double> %va, <4 x double> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double>, <4 x double>) ++ ++define <8 x i32> @lasx_xvftintrm_w_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvftintrm_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrm.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftintrm.w.d(<4 x double> %va, <4 x double> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double>, <4 x double>) ++ ++define <8 x i32> @lasx_xvftint_w_d(<4 x double> %va, <4 x double> %vb) nounwind { ++; CHECK-LABEL: lasx_xvftint_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftint.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvftint.w.d(<4 x double> %va, <4 x double> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrnel_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrnel_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrnel.l.s $xr0, $xr0 ++; CHECK-NEXT: ret 
++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrnel.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrneh_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrneh_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrneh.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrneh.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrzl_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrzl_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrzl.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrzl.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrzh_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrzh_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrzh.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrzh.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrpl_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrpl_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrpl.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrpl.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrph.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrph_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrph_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrph.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> 
@llvm.loongarch.lasx.xvftintrph.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrml_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrml_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrml.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrml.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintrmh_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintrmh_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintrmh.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintrmh.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftintl_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftintl_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftintl.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftintl.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float>) ++ ++define <4 x i64> @lasx_xvftinth_l_s(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvftinth_l_s: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvftinth.l.s $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvftinth.l.s(<8 x float> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-haddw.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-haddw.ll +new file mode 100644 +index 000000000000..2c64ab23806b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-haddw.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < 
%s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvhaddw_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvhaddw.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvhaddw_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvhaddw.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvhaddw_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhaddw.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvhaddw_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhaddw.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvhaddw_hu_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.hu.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x 
i16> @llvm.loongarch.lasx.xvhaddw.hu.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvhaddw_wu_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.wu.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvhaddw.wu.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvhaddw_du_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.du.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhaddw.du.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvhaddw_qu_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhaddw_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhaddw.qu.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhaddw.qu.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-hsubw.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-hsubw.ll +new file mode 100644 +index 000000000000..a5223c1d89a0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-hsubw.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvhsubw_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_h_b: ++; CHECK: # %bb.0: 
# %entry ++; CHECK-NEXT: xvhsubw.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvhsubw.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvhsubw_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvhsubw.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvhsubw_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhsubw.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvhsubw_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhsubw.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvhsubw_hu_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.hu.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvhsubw.hu.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvhsubw_wu_hu(<16 x i16> 
%va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.wu.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvhsubw.wu.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvhsubw_du_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.du.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhsubw.du.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvhsubw_qu_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvhsubw_qu_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvhsubw.qu.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvhsubw.qu.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ilv.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ilv.ll +new file mode 100644 +index 000000000000..c9d0ca6b0324 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ilv.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvilvl_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvl_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvl.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvilvl.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 
x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvilvl_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvl_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvl.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvilvl.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvilvl_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvl_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvl.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvilvl.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvilvl_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvl_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvl.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvilvl.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvilvh_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvh_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvh.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvilvh.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvilvh_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvh_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvh.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvilvh.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x 
i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvilvh_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvh_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvh.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvilvh.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvilvh_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvilvh_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvilvh.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvilvh.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr.ll +new file mode 100644 +index 000000000000..ea98c96464ae +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr.ll +@@ -0,0 +1,28 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32>, i32, i32) ++ ++define <8 x i32> @lasx_xvinsgr2vr_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvinsgr2vr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a0, $zero, 1 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> %va, i32 1, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32) ++ ++define <4 x i64> @lasx_xvinsgr2vr_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvinsgr2vr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a0, $zero, 1 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %va, i64 1, i32 1) ++ ret 
<4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0.ll +new file mode 100644 +index 000000000000..27ae819c4144 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvinsve0_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvinsve0_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvinsve0_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvinsve0_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld.ll +new file mode 100644 +index 000000000000..5ffc629db466 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld.ll +@@ -0,0 +1,26 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvld(i8*, i32) ++ ++define <32 x i8> @lasx_xvld(i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvld: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvld(i8* %p, i32 1) ++ ret <32 x i8> %res ++} 
++ ++declare <32 x i8> @llvm.loongarch.lasx.xvldx(i8*, i64) ++ ++define <32 x i8> @lasx_xvldx(i8* %p, i64 %b) nounwind { ++; CHECK-LABEL: lasx_xvldx: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvldx $xr0, $a0, $a1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvldx(i8* %p, i64 %b) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi.ll +new file mode 100644 +index 000000000000..59f79dd32af3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvldi(i32) ++ ++define <4 x i64> @lasx_xvldi() nounwind { ++; CHECK-LABEL: lasx_xvldi: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvldi $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32) ++ ++define <32 x i8> @lasx_xvrepli_b() nounwind { ++; CHECK-LABEL: lasx_xvrepli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.b $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32) ++ ++define <16 x i16> @lasx_xvrepli_h() nounwind { ++; CHECK-LABEL: lasx_xvrepli_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.h $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32) ++ ++define <8 x i32> @lasx_xvrepli_w() nounwind { ++; CHECK-LABEL: lasx_xvrepli_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.w $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> 
@llvm.loongarch.lasx.xvrepli.w(i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32) ++ ++define <4 x i64> @lasx_xvrepli_d() nounwind { ++; CHECK-LABEL: lasx_xvrepli_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.d $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl.ll +new file mode 100644 +index 000000000000..ae6abdf81cbc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8*, i32) ++ ++define <32 x i8> @lasx_xvldrepl_b(i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvldrepl_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvldrepl.b $xr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8* %p, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8*, i32) ++ ++define <16 x i16> @lasx_xvldrepl_h(i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvldrepl_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvldrepl.h $xr0, $a0, 2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8* %p, i32 2) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8*, i32) ++ ++define <8 x i32> @lasx_xvldrepl_w(i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvldrepl_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvldrepl.w $xr0, $a0, 4 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8* %p, i32 4) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8*, i32) ++ ++define <4 x i64> @lasx_xvldrepl_d(i8* %p) nounwind { ++; 
CHECK-LABEL: lasx_xvldrepl_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvldrepl.d $xr0, $a0, 8 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8* %p, i32 8) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-madd.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-madd.ll +new file mode 100644 +index 000000000000..d3b09396727e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-madd.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8>, <32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmadd_b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmadd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmadd.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmadd.b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16>, <16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmadd_h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmadd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmadd.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmadd.h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32>, <8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmadd_w(<8 x i32> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmadd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmadd.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmadd.w(<8 x i32> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> 
@llvm.loongarch.lasx.xvmadd.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmadd_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmadd.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmadd.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-maddw.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-maddw.ll +new file mode 100644 +index 000000000000..146624a764a2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-maddw.ll +@@ -0,0 +1,290 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16>, <32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmaddwev_h_b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.h.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32>, <16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmaddwev_w_h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.w.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64>, <8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmaddwev_d_w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_d_w: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.d.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmaddwev_q_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.q.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16>, <32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmaddwev_h_bu(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.h.bu $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32>, <16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmaddwev_w_hu(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.w.hu $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64>, <8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmaddwev_d_wu(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.d.wu $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x 
i64> @llvm.loongarch.lasx.xvmaddwev.d.wu(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmaddwev_q_du(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.q.du $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16>, <32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmaddwev_h_bu_b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.h.bu.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaddwev.h.bu.b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32>, <16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmaddwev_w_hu_h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.w.hu.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaddwev.w.hu.h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64>, <8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmaddwev_d_wu_w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.d.wu.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.d.wu.w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) ++ 
ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmaddwev_q_du_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwev_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwev.q.du.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwev.q.du.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16>, <32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmaddwod_h_b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.h.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32>, <16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmaddwod_w_h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.w.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64>, <8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmaddwod_d_w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.d.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ 
++define <4 x i64> @lasx_xvmaddwod_q_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.q.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16>, <32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmaddwod_h_bu(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.h.bu $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32>, <16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmaddwod_w_hu(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.w.hu $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64>, <8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmaddwod_d_wu(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.d.wu $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmaddwod_q_du(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: 
lasx_xvmaddwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.q.du $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16>, <32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmaddwod_h_bu_b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.h.bu.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaddwod.h.bu.b(<16 x i16> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32>, <16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmaddwod_w_hu_h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.w.hu.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaddwod.w.hu.h(<8 x i32> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64>, <8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmaddwod_d_wu_w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.d.wu.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.d.wu.w(<4 x i64> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmaddwod_q_du_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmaddwod_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaddwod.q.du.d 
$xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaddwod.q.du.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max.ll +new file mode 100644 +index 000000000000..9cf09df4439a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmax_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmax.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmax_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmax.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmax_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmax.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmax_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_d: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmax.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmaxi_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmaxi_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmaxi_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmaxi_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_vmax_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_vmax_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmax.bu(<32 x i8> %va, <32 x 
i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmax_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmax.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmax_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmax.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmax_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmax_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmax.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmaxi_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmaxi_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x 
i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmaxi_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmaxi_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvmaxi_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmaxi.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min.ll +new file mode 100644 +index 000000000000..c94b1e4ea44c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmin_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmin.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmin_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmin.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32>, <8 x i32>) 
++ ++define <8 x i32> @lasx_xvmin_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmin.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmin_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmin.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmini_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmini_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmini_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmini_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_d: ++; CHECK: # %bb.0: # %entry 
++; CHECK-NEXT: xvmini.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmin_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmin.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmin_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmin.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmin_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmin.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmin_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmin_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmin.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmin.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmini_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.bu $xr0, $xr0, 
1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmini_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmini_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmini_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvmini_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmini.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mod.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mod.ll +new file mode 100644 +index 000000000000..a177246bb235 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mod.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmod_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmod.b(<32 
x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmod_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmod.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmod_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmod.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmod_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmod.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmod_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmod.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmod_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmod.hu(<16 x i16> %va, <16 x 
i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmod_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmod.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmod_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmod_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmod.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmod.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskgez.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskgez.ll +new file mode 100644 +index 000000000000..da87c20ad6ee +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskgez.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvmskgez_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmskgez_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmskgez.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmskgez.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskltz.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskltz.ll +new file mode 100644 +index 000000000000..b2218487535c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mskltz.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 
--mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvmskltz_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmskltz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmskltz.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmskltz.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16>) ++ ++define <16 x i16> @lasx_xvmskltz_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvmskltz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmskltz.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmskltz.h(<16 x i16> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32>) ++ ++define <8 x i32> @lasx_xvmskltz_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvmskltz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmskltz.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmskltz.w(<8 x i32> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvmskltz_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvmskltz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmskltz.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmskltz.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-msknz.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-msknz.ll +new file mode 100644 +index 000000000000..becd2c883a7e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-msknz.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8>) ++ ++define <32 x i8> 
@lasx_xvmsknz_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvmsknz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmsknz.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmsknz.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-msub.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-msub.ll +new file mode 100644 +index 000000000000..c89f9578b77d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-msub.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8>, <32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmsub_b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmsub_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmsub.b $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmsub.b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16>, <16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmsub_h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmsub_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmsub.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmsub.h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32>, <8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmsub_w(<8 x i32> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmsub_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmsub.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmsub.w(<8 x i32> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <8 x i32> 
%res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmsub_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvmsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmsub.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmsub.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-muh.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-muh.ll +new file mode 100644 +index 000000000000..97461512ce16 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-muh.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmuh_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmuh.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmuh_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmuh.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmuh_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmuh.w(<8 x i32> %va, 
<8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmuh_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmuh.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmuh_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmuh.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmuh_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmuh.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmuh_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmuh.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmuh_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmuh_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmuh.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmuh.du(<4 x i64> %va, <4 x i64> %vb) 
++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mul.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mul.ll +new file mode 100644 +index 000000000000..d5d852e58a9f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mul.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvmul_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmul_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmul.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmul.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvmul_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmul_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmul.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmul.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvmul_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmul_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmul.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmul.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmul_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmul_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmul.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmul.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x 
i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mulw.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mulw.ll +new file mode 100644 +index 000000000000..f69e64aa7698 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-mulw.ll +@@ -0,0 +1,290 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmulwev_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmulwev_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmulwev_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmulwev_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = 
call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmulwev_h_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.h.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmulwev_w_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.w.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmulwev_d_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.d.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmulwev_q_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.q.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmulwev_h_bu_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: 
lasx_xvmulwev_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.h.bu.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmulwev.h.bu.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmulwev_w_hu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.w.hu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmulwev.w.hu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmulwev_d_wu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.d.wu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwev.d.wu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmulwev_q_du_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwev_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwev.q.du.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwev.q.du.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmulwod_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> 
@llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmulwod_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmulwod_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmulwod_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmulwod_h_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.h.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmulwod_w_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.w.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> 
@llvm.loongarch.lasx.xvmulwod.w.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmulwod_d_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.d.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmulwod_q_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.q.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvmulwod_h_bu_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_h_bu_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.h.bu.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmulwod.h.bu.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvmulwod_w_hu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_w_hu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.w.hu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmulwod.w.hu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvmulwod_d_wu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: 
lasx_xvmulwod_d_wu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.d.wu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwod.d.wu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvmulwod_q_du_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvmulwod_q_du_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvmulwod.q.du.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmulwod.q.du.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-neg.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-neg.ll +new file mode 100644 +index 000000000000..ecbedf334657 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-neg.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvneg_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvneg_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvneg.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvneg.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16>) ++ ++define <16 x i16> @lasx_xvneg_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvneg_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvneg.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvneg.h(<16 x i16> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32>) ++ ++define <8 x i32> @lasx_xvneg_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvneg_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvneg.w $xr0, 
$xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvneg.w(<8 x i32> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvneg_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvneg_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvneg.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvneg.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nor.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nor.ll +new file mode 100644 +index 000000000000..674746b7624e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nor.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvnor_v(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvnor_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvnor.v(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori.ll +new file mode 100644 +index 000000000000..55eebf87ee92 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvnori_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvnori_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvnori.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvnori.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-or.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-or.ll +new file mode 100644 +index 000000000000..16462cfafc54 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-or.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvor_v(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvor_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvor.v(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori.ll +new file mode 100644 +index 000000000000..8e53d88bac37 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvori_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvori_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvori.b $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> %va, i32 3) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-orn.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-orn.ll +new file mode 100644 +index 000000000000..3a335cdd3716 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-orn.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc 
--mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvorn_v(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvorn_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvorn.v(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pack.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pack.ll +new file mode 100644 +index 000000000000..512b30234917 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pack.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvpackev_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackev_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackev.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpackev.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvpackev_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackev_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackev.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvpackev.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvpackev_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackev_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackev.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x 
i32> @llvm.loongarch.lasx.xvpackev.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvpackev_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackev_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackev.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpackev.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvpackod_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackod_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackod.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpackod.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvpackod_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackod_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackod.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvpackod.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvpackod_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackod_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackod.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpackod.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvpackod_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpackod_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpackod.d $xr0, $xr0, $xr1 ++; 
CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpackod.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pcnt.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pcnt.ll +new file mode 100644 +index 000000000000..d77f1d2082c8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pcnt.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvpcnt_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvpcnt_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpcnt.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpcnt.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16>) ++ ++define <16 x i16> @lasx_xvpcnt_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvpcnt_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpcnt.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvpcnt.h(<16 x i16> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32>) ++ ++define <8 x i32> @lasx_xvpcnt_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvpcnt_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpcnt.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpcnt.w(<8 x i32> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvpcnt_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvpcnt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpcnt.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpcnt.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-perm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-perm.ll +new file mode 100644 +index 000000000000..4ec434edd4ec +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-perm.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvperm_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvperm_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvperm.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvperm.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll +new file mode 100644 +index 000000000000..0d9f9daabc44 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvpermi_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpermi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpermi.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvpermi_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvpermi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpermi.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ 
++declare <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvpermi_q(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpermi_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pick.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pick.ll +new file mode 100644 +index 000000000000..bbd6d693ca0b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pick.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvpickev_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickev_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickev.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpickev.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvpickev_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickev_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickev.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvpickev.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvpickev_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickev_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickev.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpickev.w(<8 x 
i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvpickev_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickev_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickev.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpickev.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvpickod_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickod_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickod.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpickod.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvpickod_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickod_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickod.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvpickod.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvpickod_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickod_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickod.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpickod.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvpickod_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvpickod_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickod.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x 
i64> @llvm.loongarch.lasx.xvpickod.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve.ll +new file mode 100644 +index 000000000000..546777bc72ab +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvpickve_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvpickve_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float>, i32) ++ ++define <8 x float> @lasx_xvpickve_w_f(<8 x float> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve_w_f: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> %va, i32 1) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double>, i32) ++ ++define <4 x double> @lasx_xvpickve_d_f(<4 x double> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve_d_f: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x double> 
@llvm.loongarch.lasx.xvpickve.d.f(<4 x double> %va, i32 1) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll +new file mode 100644 +index 000000000000..0617e7424321 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll +@@ -0,0 +1,53 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++ ++ ++ ++declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) ++ ++define i32 @lasx_xvpickve2gr_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve2gr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> %va, i32 1) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) ++ ++define i64 @lasx_xvpickve2gr_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve2gr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 1) ++ ret i64 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, i32) ++ ++define i32 @lasx_xvpickve2gr_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve2gr_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 1) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) ++ ++define i64 @lasx_xvpickve2gr_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvpickve2gr_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvpickve2gr.du $a0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 1) ++ ret i64 %res ++} +diff 
--git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei.ll +new file mode 100644 +index 000000000000..25fab44f461f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvrepl128vei_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvrepl128vei_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepl128vei.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvrepl128vei_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvrepl128vei_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepl128vei.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvrepl128vei_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvrepl128vei_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvrepl128vei_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvrepl128vei_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> 
%res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll +new file mode 100644 +index 000000000000..c71abd2205c6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32) ++ ++define <32 x i8> @lasx_xvreplgr2vr_b(i32 %a) nounwind { ++; CHECK-LABEL: lasx_xvreplgr2vr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32 %a) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32) ++ ++define <16 x i16> @lasx_xvreplgr2vr_h(i32 %a) nounwind { ++; CHECK-LABEL: lasx_xvreplgr2vr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvreplgr2vr.h(i32 %a) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32) ++ ++define <8 x i32> @lasx_xvreplgr2vr_w(i32 %a) nounwind { ++; CHECK-LABEL: lasx_xvreplgr2vr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 %a) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) ++ ++define <4 x i64> @lasx_xvreplgr2vr_d(i64 %a) nounwind { ++; CHECK-LABEL: lasx_xvreplgr2vr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve.ll +new file mode 100644 
+index 000000000000..21d36ff7bb5e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvreplve_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK-LABEL: lasx_xvreplve_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve.b $xr0, $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvreplve.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvreplve_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK-LABEL: lasx_xvreplve_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve.h $xr0, $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvreplve.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvreplve_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK-LABEL: lasx_xvreplve_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvreplve.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvreplve_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK-LABEL: lasx_xvreplve_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvreplve.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve0.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve0.ll +new file mode 100644 +index 
000000000000..7996bb36ef03 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replve0.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8>) ++ ++define <32 x i8> @lasx_xvreplve0_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvreplve0_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve0.b $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvreplve0.b(<32 x i8> %va) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16>) ++ ++define <16 x i16> @lasx_xvreplve0_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvreplve0_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve0.h $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvreplve0.h(<16 x i16> %va) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32>) ++ ++define <8 x i32> @lasx_xvreplve0_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvreplve0_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve0.w $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvreplve0.w(<8 x i32> %va) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64>) ++ ++define <4 x i64> @lasx_xvreplve0_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvreplve0_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve0.d $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvreplve0.d(<4 x i64> %va) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8>) ++ ++define <32 x i8> @lasx_xvreplve0_q(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvreplve0_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplve0.q $xr0, $xr0 ++; CHECK-NEXT: ret ++entry: 
++ %res = call <32 x i8> @llvm.loongarch.lasx.xvreplve0.q(<32 x i8> %va) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr.ll +new file mode 100644 +index 000000000000..64d2773864e9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvrotr_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvrotr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotr.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrotr.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvrotr_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvrotr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotr.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrotr.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvrotr_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvrotr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotr.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrotr.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvrotr_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvrotr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotr.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ 
%res = call <4 x i64> @llvm.loongarch.lasx.xvrotr.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvrotri_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvrotri_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotri.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvrotri_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvrotri_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotri.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvrotri_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvrotri_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotri.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvrotri_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvrotri_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrotri.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sadd.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sadd.ll +new file mode 100644 +index 000000000000..54a5e2e9c833 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sadd.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | 
FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsadd_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsadd.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsadd_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsadd.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsadd_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsadd.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsadd_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsadd.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsadd_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsadd.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> 
%res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsadd_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsadd.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsadd_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsadd.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsadd_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsadd_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsadd.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsadd.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat.ll +new file mode 100644 +index 000000000000..293b9dc9eb4d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsat_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ 
++declare <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsat_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsat_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsat_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsat_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsat_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsat_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_wu: ++; CHECK: # %bb.0: 
# %entry ++; CHECK-NEXT: xvsat.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsat_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsat_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsat.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq.ll +new file mode 100644 +index 000000000000..83bc93c88c73 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvseq_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvseq_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseq.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvseq.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvseq_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvseq_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvseq.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvseq_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvseq_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1 
++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvseq.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvseq_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvseq_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseq.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvseq.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvseqi_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvseqi_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseqi.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvseqi_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvseqi_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseqi.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvseqi_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvseqi_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseqi.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvseqi_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvseqi_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseqi.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff 
--git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll +new file mode 100644 +index 000000000000..6e3e2e0330f5 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lasx.xbz.v(<32 x i8>) ++ ++define i32 @lasx_xbz_v(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xbz_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvseteqz.v $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB0_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB0_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbz.v(<32 x i8> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8>) ++ ++define i32 @lasx_xbnz_v(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xbnz_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetnez.v $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB1_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB1_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbnz.v(<32 x i8> %va) ++ ret i32 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll +new file mode 100644 +index 000000000000..a466b78bf8d2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8>) ++ ++define i32 @lasx_xbnz_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: 
lasx_xbnz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetallnez.b $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB0_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB0_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16>) ++ ++define i32 @lasx_xbnz_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xbnz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetallnez.h $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB1_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB1_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbnz.h(<16 x i16> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32>) ++ ++define i32 @lasx_xbnz_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xbnz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetallnez.w $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB2_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB2_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbnz.w(<8 x i32> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64>) ++ ++define i32 @lasx_xbnz_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xbnz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetallnez.d $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB3_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB3_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbnz.d(<4 x i64> %va) ++ ret i32 %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll +new file mode 100644 +index 000000000000..36e65fc5b328 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare i32 @llvm.loongarch.lasx.xbz.b(<32 x i8>) ++ ++define i32 @lasx_xbz_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xbz_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetanyeqz.b $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB0_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB0_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbz.b(<32 x i8> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xbz.h(<16 x i16>) ++ ++define i32 @lasx_xbz_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xbz_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetanyeqz.h $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB1_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB1_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbz.h(<16 x i16> %va) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xbz.w(<8 x i32>) ++ ++define i32 @lasx_xbz_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xbz_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetanyeqz.w $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB2_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB2_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbz.w(<8 x i32> %va) ++ ret i32 %res ++} ++ ++declare i32 
@llvm.loongarch.lasx.xbz.d(<4 x i64>) ++ ++define i32 @lasx_xbz_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xbz_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsetanyeqz.d $fcc0, $xr0 ++; CHECK-NEXT: bcnez $fcc0, .LBB3_2 ++; CHECK-NEXT: # %bb.1: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 0 ++; CHECK-NEXT: ret ++; CHECK-NEXT: .LBB3_2: # %entry ++; CHECK-NEXT: addi.w $a0, $zero, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xbz.d(<4 x i64> %va) ++ ret i32 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf.ll +new file mode 100644 +index 000000000000..9b9140f6ad62 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8>, <32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvshuf_b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) nounwind { ++; CHECK-LABEL: lasx_xvshuf_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf.b $xr0, $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvshuf.b(<32 x i8> %va, <32 x i8> %vb, <32 x i8> %vc) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16>, <16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvshuf_h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) nounwind { ++; CHECK-LABEL: lasx_xvshuf_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf.h $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvshuf.h(<16 x i16> %va, <16 x i16> %vb, <16 x i16> %vc) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32>, <8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvshuf_w(<8 x i32> %va, <8 x i32> %vb, <8 x i32> %vc) nounwind { ++; 
CHECK-LABEL: lasx_xvshuf_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf.w $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvshuf.w(<8 x i32> %va, <8 x i32> %vb, <8 x i32> %vc) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64>, <4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvshuf_d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) nounwind { ++; CHECK-LABEL: lasx_xvshuf_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf.d $xr0, $xr1, $xr2 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvshuf.d(<4 x i64> %va, <4 x i64> %vb, <4 x i64> %vc) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i.ll +new file mode 100644 +index 000000000000..31205086759c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvshuf4i_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvshuf4i_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvshuf4i_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvshuf4i_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf4i.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvshuf4i_w(<8 x i32> %va) nounwind 
{ ++; CHECK-LABEL: lasx_xvshuf4i_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf4i.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvshuf4i_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvshuf4i_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvshuf4i.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-signcov.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-signcov.ll +new file mode 100644 +index 000000000000..e6c6d8ccd0d3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-signcov.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsigncov_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsigncov_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsigncov.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsigncov.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsigncov_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsigncov_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsigncov.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsigncov.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32>, <8 x i32>) ++ ++define 
<8 x i32> @lasx_xvsigncov_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsigncov_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsigncov.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsigncov.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsigncov_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsigncov_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsigncov.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsigncov.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle.ll +new file mode 100644 +index 000000000000..8895efc84b84 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsle_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsle.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsle_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsle.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32>, 
<8 x i32>) ++ ++define <8 x i32> @lasx_xvsle_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsle.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsle_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsle.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslei_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslei.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslei_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslei.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslei_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslei.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslei_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_d: ++; CHECK: # 
%bb.0: # %entry ++; CHECK-NEXT: xvslei.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsle_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsle.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsle_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsle.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsle_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsle.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsle_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsle_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsle.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsle.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslei_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
xvslei.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslei_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslei.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslei_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslei.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslei_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvslei_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslei.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll.ll +new file mode 100644 +index 000000000000..14110b613dbe +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsll.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsll_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsll_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsll.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvsll.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsll_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsll_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsll.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsll.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsll_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsll_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsll.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsll.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsll_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsll_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsll.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsll.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslli_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvslli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslli.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslli_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvslli_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslli.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res 
++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslli_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvslli_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslli.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslli_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvslli_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslli.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil.ll +new file mode 100644 +index 000000000000..a72b8a6cbb4f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8>, i32) ++ ++define <16 x i16> @lasx_xvsllwil_h_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsllwil_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsllwil.h.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16>, i32) ++ ++define <8 x i32> @lasx_xvsllwil_w_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsllwil_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsllwil.w.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32>, i32) 
++ ++define <4 x i64> @lasx_xvsllwil_d_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsllwil_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsllwil.d.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8>, i32) ++ ++define <16 x i16> @lasx_xvsllwil_hu_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsllwil_hu_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsllwil.hu.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16>, i32) ++ ++define <8 x i32> @lasx_xvsllwil_wu_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsllwil_wu_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsllwil.wu.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32>, i32) ++ ++define <4 x i64> @lasx_xvsllwil_du_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsllwil_du_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsllwil.du.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt.ll +new file mode 100644 +index 000000000000..3ea87adff110 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8>, <32 x i8>) ++ ++define 
<32 x i8> @lasx_xvslt_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslt.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvslt_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslt.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvslt_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslt.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvslt_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslt.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslti_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslti_h(<16 x i16> %va) nounwind { ++; 
CHECK-LABEL: lasx_xvslti_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslti_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslti_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvslt_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslt.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvslt_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslt.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvslt_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.wu 
$xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslt.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvslt_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvslt_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslt.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslt.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslti_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslti_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslti_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslti_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvslti_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvslti.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> 
%va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra.ll +new file mode 100644 +index 000000000000..a7498682559b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsra_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsra_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsra.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsra.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsra_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsra_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsra.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsra.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsra_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsra_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsra.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsra.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsra_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsra_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsra.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsra.d(<4 x i64> %va, <4 x i64> %vb) ++ 
ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrai_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrai_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrai_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrai_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrai_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrai_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrai_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrai_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sran.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sran.ll +new file mode 100644 +index 000000000000..f59ae4c19662 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sran.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x 
i8> @lasx_xvsran_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsran_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsran.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsran.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvsran_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsran_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsran.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsran.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvsran_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsran_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsran.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsran.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani.ll +new file mode 100644 +index 000000000000..91fb90da9c52 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrani_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrani_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrani.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> 
@llvm.loongarch.lasx.xvsrani.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrani_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrani_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrani.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrani_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrani_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrani.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrani_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrani_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrani.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar.ll +new file mode 100644 +index 000000000000..e2c160557c4d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrar.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsrar_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrar_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrar.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvsrar.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsrar_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrar_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrar.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrar.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsrar_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrar_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrar.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrar.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsrar_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrar_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrar.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrar.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrari_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrari_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrari.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrari_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrari_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrari.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> %va, i32 
1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrari_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrari_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrari.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrari_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrari_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrari.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarn.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarn.ll +new file mode 100644 +index 000000000000..02dd989773ca +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarn.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvsrarn_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarn.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrarn.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvsrarn_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarn.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrarn.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret 
<16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvsrarn_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarn.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrarn.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni.ll +new file mode 100644 +index 000000000000..a7d2c3739793 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrarni_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarni.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrarni_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarni.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrarni_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarni.w.d $xr0, $xr1, 1 ++; 
CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrarni_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrarni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrarni.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl.ll +new file mode 100644 +index 000000000000..7b2992f2ca3b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsrl_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrl_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrl.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrl.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsrl_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrl_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrl.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrl.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsrl_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrl_w: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: xvsrl.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrl.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsrl_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrl_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrl.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrl.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrli_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrli_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrli_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrli_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrli_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrli_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrli_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrli_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> %va, 
i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srln.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srln.ll +new file mode 100644 +index 000000000000..dc5c0e016ea0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srln.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvsrln_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrln_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrln.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrln.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvsrln_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrln_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrln.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrln.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvsrln_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrln_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrln.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrln.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni.ll +new file mode 100644 +index 000000000000..0301ebb195e2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlni_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlni.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlni_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlni.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlni_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlni.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlni_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlni.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr.ll +new file mode 100644 +index 000000000000..e04504158e27 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsrlr_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlr_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlr.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlr.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsrlr_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlr_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlr.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlr.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsrlr_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlr_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlr.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlr.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsrlr_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlr_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlr.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlr.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlri_b(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrlri_b: ++; CHECK: # 
%bb.0: # %entry ++; CHECK-NEXT: xvsrlri.b $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlri_h(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrlri_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlri.h $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlri_w(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrlri_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlri.w $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlri_d(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsrlri_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlri.d $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrn.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrn.ll +new file mode 100644 +index 000000000000..1e7df379c6e1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrn.ll +@@ -0,0 +1,38 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvsrlrn_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlrn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrn.b.h $xr0, $xr0, $xr1 ++; 
CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlrn.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvsrlrn_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlrn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrn.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlrn.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvsrlrn_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlrn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrn.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlrn.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni.ll +new file mode 100644 +index 000000000000..56dbafe8b1ac +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlrni_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlrni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrni.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlrni_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; 
CHECK-LABEL: lasx_xvsrlrni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrni.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlrni_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlrni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrni.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlrni_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsrlrni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsrlrni.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssran.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssran.ll +new file mode 100644 +index 000000000000..da1857dad145 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssran.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssran_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssran_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssran.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssran.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> 
@llvm.loongarch.lasx.xvssran.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssran_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssran_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssran.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssran.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssran_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssran_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssran.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssran.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssran_bu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssran_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssran.bu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssran.bu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssran_hu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssran_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssran.hu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssran.hu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssran.wu.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssran_wu_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssran_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssran.wu.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> 
@llvm.loongarch.lasx.xvssran.wu.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani.ll +new file mode 100644 +index 000000000000..9efa659b4a1e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrani_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrani_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrani_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrani_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_d_q: 
++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrani_bu_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.bu.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrani_hu_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.hu.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrani_wu_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.wu.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrani_du_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrani_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrani.du.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarn.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarn.ll +new file mode 100644 +index 000000000000..b5d59ff06f4d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarn.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssrarn_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarn.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarn.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssrarn_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarn.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarn.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssrarn_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarn.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarn.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssrarn_bu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarn_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarn.bu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvssrarn.bu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssrarn_hu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarn_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarn.hu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarn.hu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssrarn_wu_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarn_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarn.wu.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarn.wu.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni.ll +new file mode 100644 +index 000000000000..da411dad645b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrarni_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrarni_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: 
lasx_xvssrarni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrarni_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrarni_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrarni_bu_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.bu.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrarni_hu_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.hu.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ 
++declare <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrarni_wu_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.wu.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrarni_du_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrarni_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrarni.du.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrln.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrln.ll +new file mode 100644 +index 000000000000..c60b5bdf81a0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrln.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssrln_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrln_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrln.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrln.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssrln_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrln_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrln.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: 
++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrln.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssrln_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrln_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrln.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrln.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssrln_bu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrln_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrln.bu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrln.bu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssrln_hu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrln_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrln.hu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrln.hu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssrln_wu_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrln_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrln.wu.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrln.wu.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni.ll +new file mode 100644 +index 000000000000..e57dd426bde8 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlni_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlni_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlni_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlni_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x 
i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlni_bu_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.bu.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlni_hu_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.hu.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlni_wu_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.wu.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlni_du_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlni_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlni.du.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrn.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrn.ll +new file mode 100644 +index 000000000000..774cf1bd5e84 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrn.ll +@@ -0,0 +1,74 @@ ++; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssrlrn_b_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrn_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrn.b.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.b.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssrlrn_h_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrn_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrn.h.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.h.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssrlrn_w_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrn_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrn.w.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.w.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16>, <16 x i16>) ++ ++define <32 x i8> @lasx_xvssrlrn_bu_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrn_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrn.bu.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrn.bu.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32>, <8 x i32>) ++ ++define <16 x i16> @lasx_xvssrlrn_hu_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrn_hu_w: ++; CHECK: 
# %bb.0: # %entry ++; CHECK-NEXT: xvssrlrn.hu.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrn.hu.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64>, <4 x i64>) ++ ++define <8 x i32> @lasx_xvssrlrn_wu_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrn_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrn.wu.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrn.wu.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <8 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni.ll +new file mode 100644 +index 000000000000..9a80516d8d78 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlrni_b_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_b_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.b.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlrni_h_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_h_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.h.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32>, <8 x 
i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlrni_w_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_w_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.w.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlrni_d_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_d_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.d.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlrni_bu_h(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_bu_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.bu.h $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlrni_hu_w(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_hu_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.hu.w $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlrni_wu_d(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_wu_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.wu.d $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x 
i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlrni_du_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssrlrni_du_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssrlrni.du.q $xr0, $xr1, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssub.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssub.ll +new file mode 100644 +index 000000000000..cd3ccd9f5262 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssub.ll +@@ -0,0 +1,98 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvssub_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssub.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvssub_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssub.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvssub_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.w $xr0, 
$xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssub.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvssub_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssub.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvssub_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssub.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvssub_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssub.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvssub_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvssub.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssub.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvssub_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvssub_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: 
xvssub.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssub.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st.ll +new file mode 100644 +index 000000000000..b69e7b813f0c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st.ll +@@ -0,0 +1,27 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare void @llvm.loongarch.lasx.xvst(<32 x i8>, i8*, i32) ++ ++define void @lasx_xvst(<32 x i8> %va, i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvst: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvst $xr0, $a0, 1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lasx.xvst(<32 x i8> %va, i8* %p, i32 1) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstx(<32 x i8>, i8*, i64) ++ ++define void @lasx_xvstx(<32 x i8> %va, i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvstx: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a1, $zero, 1 ++; CHECK-NEXT: xvstx $xr0, $a0, $a1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lasx.xvstx(<32 x i8> %va, i8* %p, i64 1) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm.ll +new file mode 100644 +index 000000000000..52ef3c471412 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare void @llvm.loongarch.lasx.xvstelm.b(<32 x i8>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_b(<32 x i8> %va, i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvstelm_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvstelm.b $xr0, $a0, 1, 1 ++; CHECK-NEXT: ret ++entry: ++ call void 
@llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 1, i32 1) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.h(<16 x i16>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_h(<16 x i16> %va, i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvstelm_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvstelm.h $xr0, $a0, 2, 1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 2, i32 1) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.w(<8 x i32>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_w(<8 x i32> %va, i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvstelm_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvstelm.w $xr0, $a0, 4, 1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 4, i32 1) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.d(<4 x i64>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_d(<4 x i64> %va, i8* %p) nounwind { ++; CHECK-LABEL: lasx_xvstelm_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvstelm.d $xr0, $a0, 8, 1 ++; CHECK-NEXT: ret ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 8, i32 1) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sub.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sub.ll +new file mode 100644 +index 000000000000..4d69dd83dcde +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sub.ll +@@ -0,0 +1,62 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvsub_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsub_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsub.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> 
%res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16>, <16 x i16>) ++ ++define <16 x i16> @lasx_xvsub_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsub_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsub.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32>, <8 x i32>) ++ ++define <8 x i32> @lasx_xvsub_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsub_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsub.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsub_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsub_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsub.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsub_q(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsub_q: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsub.q $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsub.q(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi.ll +new file mode 100644 +index 000000000000..cc3235ff4657 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi.ll +@@ -0,0 +1,50 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ 
++declare <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsubi_bu(<32 x i8> %va) nounwind { ++; CHECK-LABEL: lasx_xvsubi_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubi.bu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> %va, i32 1) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsubi_hu(<16 x i16> %va) nounwind { ++; CHECK-LABEL: lasx_xvsubi_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubi.hu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> %va, i32 1) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsubi_wu(<8 x i32> %va) nounwind { ++; CHECK-LABEL: lasx_xvsubi_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubi.wu $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> %va, i32 1) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsubi_du(<4 x i64> %va) nounwind { ++; CHECK-LABEL: lasx_xvsubi_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubi.du $xr0, $xr0, 1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> %va, i32 1) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subw.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subw.ll +new file mode 100644 +index 000000000000..6f203e894990 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subw.ll +@@ -0,0 +1,194 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> 
@lasx_xvsubwev_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvsubwev_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvsubwev_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsubwev_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvsubwev_h_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.h.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubwev.h.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ 
++declare <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvsubwev_w_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.w.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubwev.w.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvsubwev_d_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_d_wu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.d.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwev.d.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsubwev_q_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwev_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwev.q.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwev.q.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvsubwod_h_b(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_h_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.h.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.b(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvsubwod_w_h(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_w_h: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.w.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res 
= call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.h(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvsubwod_d_w(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_d_w: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.d.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.w(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsubwod_q_d(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_q_d: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.q.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.d(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8>, <32 x i8>) ++ ++define <16 x i16> @lasx_xvsubwod_h_bu(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_h_bu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.h.bu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubwod.h.bu(<32 x i8> %va, <32 x i8> %vb) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16>, <16 x i16>) ++ ++define <8 x i32> @lasx_xvsubwod_w_hu(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_w_hu: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.w.hu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubwod.w.hu(<16 x i16> %va, <16 x i16> %vb) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32>, <8 x i32>) ++ ++define <4 x i64> @lasx_xvsubwod_d_wu(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_d_wu: ++; 
CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.d.wu $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwod.d.wu(<8 x i32> %va, <8 x i32> %vb) ++ ret <4 x i64> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64>, <4 x i64>) ++ ++define <4 x i64> @lasx_xvsubwod_q_du(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK-LABEL: lasx_xvsubwod_q_du: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvsubwod.q.du $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubwod.q.du(<4 x i64> %va, <4 x i64> %vb) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xor.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xor.ll +new file mode 100644 +index 000000000000..6395b3d6f2e7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xor.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8>, <32 x i8>) ++ ++define <32 x i8> @lasx_xvxor_v(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK-LABEL: lasx_xvxor_v: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvxor.v(<32 x i8> %va, <32 x i8> %vb) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori.ll +new file mode 100644 +index 000000000000..c71d7e731165 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori.ll +@@ -0,0 +1,14 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvxori_b(<32 x i8> %va) nounwind { ++; 
CHECK-LABEL: lasx_xvxori_b: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvxori.b $xr0, $xr0, 3 ++; CHECK-NEXT: ret ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> %va, i32 3) ++ ret <32 x i8> %res ++} +-- +2.20.1 + + +From 45434adc9e68b15a6fc26f55659416ca2ef28ee3 Mon Sep 17 00:00:00 2001 +From: chenli +Date: Sat, 19 Aug 2023 17:14:12 +0800 +Subject: [PATCH 06/35] [LoongArch] Add testcases of LASX intrinsics with + immediates + +The testcases mainly cover three situations: +- the arguments which should be immediates are non immediates. +- the immediate is out of upper limit of the argument type. +- the immediate is out of lower limit of the argument type. + +Depends on D155830 + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D157571 + +(cherry picked from commit 82bbf7003cabe2b6be8ab9b88bc96ecb8a64dc49) +--- + .../lasx/intrinsic-addi-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-addi-non-imm.ll | 37 +++++ + .../lasx/intrinsic-andi-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-andi-non-imm.ll | 10 ++ + .../lasx/intrinsic-bitclr-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-bitclr-non-imm.ll | 37 +++++ + .../lasx/intrinsic-bitrev-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-bitrev-non-imm.ll | 37 +++++ + .../lasx/intrinsic-bitseli-invalid-imm.ll | 17 +++ + .../lasx/intrinsic-bitseli-non-imm.ll | 10 ++ + .../lasx/intrinsic-bitset-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-bitset-non-imm.ll | 37 +++++ + .../lasx/intrinsic-bsll-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-bsll-non-imm.ll | 10 ++ + .../lasx/intrinsic-bsrl-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-bsrl-non-imm.ll | 10 ++ + .../lasx/intrinsic-extrins-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-extrins-non-imm.ll | 37 +++++ + .../lasx/intrinsic-frstp-invalid-imm.ll | 33 +++++ + .../LoongArch/lasx/intrinsic-frstp-non-imm.ll | 19 +++ + .../lasx/intrinsic-insgr2vr-invalid-imm.ll | 33 
+++++ + .../lasx/intrinsic-insgr2vr-non-imm.ll | 19 +++ + .../lasx/intrinsic-insve0-invalid-imm.ll | 33 +++++ + .../lasx/intrinsic-insve0-non-imm.ll | 19 +++ + .../lasx/intrinsic-ld-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-ld-non-imm.ll | 10 ++ + .../lasx/intrinsic-ldi-invalid-imm.ll | 81 +++++++++++ + .../LoongArch/lasx/intrinsic-ldi-non-imm.ll | 46 +++++++ + .../lasx/intrinsic-ldrepl-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-ldrepl-non-imm.ll | 37 +++++ + .../lasx/intrinsic-max-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lasx/intrinsic-max-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-min-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lasx/intrinsic-min-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-nori-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-nori-non-imm.ll | 10 ++ + .../lasx/intrinsic-ori-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-ori-non-imm.ll | 10 ++ + .../lasx/intrinsic-permi-invalid-imm.ll | 49 +++++++ + .../LoongArch/lasx/intrinsic-permi-non-imm.ll | 28 ++++ + .../lasx/intrinsic-pickve-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-pickve-non-imm.ll | 37 +++++ + .../lasx/intrinsic-pickve2gr-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-pickve2gr-non-imm.ll | 37 +++++ + .../lasx/intrinsic-repl128vei-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-repl128vei-non-imm.ll | 37 +++++ + .../lasx/intrinsic-rotr-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-rotr-non-imm.ll | 37 +++++ + .../lasx/intrinsic-sat-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lasx/intrinsic-sat-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-seq-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-seq-non-imm.ll | 37 +++++ + .../lasx/intrinsic-shuf4i-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-shuf4i-non-imm.ll | 37 +++++ + .../lasx/intrinsic-sle-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lasx/intrinsic-sle-non-imm.ll | 73 ++++++++++ + 
.../lasx/intrinsic-sll-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-sll-non-imm.ll | 37 +++++ + .../lasx/intrinsic-sllwil-invalid-imm.ll | 97 +++++++++++++ + .../lasx/intrinsic-sllwil-non-imm.ll | 55 ++++++++ + .../lasx/intrinsic-slt-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lasx/intrinsic-slt-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-sra-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-sra-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srani-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-srani-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srar-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-srar-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srarni-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-srarni-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srl-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-srl-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srlni-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-srlni-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srlr-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-srlr-non-imm.ll | 37 +++++ + .../lasx/intrinsic-srlrni-invalid-imm.ll | 65 +++++++++ + .../lasx/intrinsic-srlrni-non-imm.ll | 37 +++++ + .../lasx/intrinsic-ssrani-invalid-imm.ll | 129 ++++++++++++++++++ + .../lasx/intrinsic-ssrani-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-ssrarni-invalid-imm.ll | 129 ++++++++++++++++++ + .../lasx/intrinsic-ssrarni-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-ssrlni-invalid-imm.ll | 129 ++++++++++++++++++ + .../lasx/intrinsic-ssrlni-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-ssrlrni-invalid-imm.ll | 129 ++++++++++++++++++ + .../lasx/intrinsic-ssrlrni-non-imm.ll | 73 ++++++++++ + .../lasx/intrinsic-st-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-st-non-imm.ll | 10 ++ + .../lasx/intrinsic-stelm-invalid-imm.ll | 121 ++++++++++++++++ + .../LoongArch/lasx/intrinsic-stelm-non-imm.ll | 65 +++++++++ + 
.../lasx/intrinsic-subi-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lasx/intrinsic-subi-non-imm.ll | 37 +++++ + .../lasx/intrinsic-xori-invalid-imm.ll | 17 +++ + .../LoongArch/lasx/intrinsic-xori-non-imm.ll | 10 ++ + 94 files changed, 5003 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-non-imm.ll + +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-invalid-imm.ll +new file mode 100644 +index 000000000000..4998847f0910 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvaddi_bu_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvaddi_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvaddi_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvaddi_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> %va, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvaddi_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> 
@lasx_xvaddi_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvaddi_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvaddi_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvaddi.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-non-imm.ll +new file mode 100644 +index 000000000000..f25f0e61a28e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-addi-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvaddi_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvaddi.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvaddi_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvaddi.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvaddi_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 
x i32> @llvm.loongarch.lasx.xvaddi.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvaddi_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvaddi.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-invalid-imm.ll +new file mode 100644 +index 000000000000..60f0b765f954 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvandi_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvandi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvandi_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvandi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> %va, i32 256) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-non-imm.ll +new file mode 100644 +index 000000000000..1273dc6b450b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-andi-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvandi_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvandi.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res 
++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-invalid-imm.ll +new file mode 100644 +index 000000000000..ecc287e89bbc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitclri_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvbitclri_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitclri_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvbitclri_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitclri_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvbitclri_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.w: argument out of 
range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitclri_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvbitclri_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitclri.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-non-imm.ll +new file mode 100644 +index 000000000000..09da85411082 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitclr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitclri_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitclri.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitclri_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitclri.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitclri_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitclri.w(<8 x i32> %va, i32 %b) 
++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitclri_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitclri.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-invalid-imm.ll +new file mode 100644 +index 000000000000..dff0884fdd5a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitrevi_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvbitrevi_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitrevi_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvbitrevi_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32>, i32) ++ ++define <8 x i32> 
@lasx_xvbitrevi_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvbitrevi_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitrevi_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvbitrevi_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitrevi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-non-imm.ll +new file mode 100644 +index 000000000000..e1aef1a82f0c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitrev-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitrevi_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitrevi.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitrevi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitrevi_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> 
@llvm.loongarch.lasx.xvbitrevi.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitrevi_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitrevi.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitrevi_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitrevi.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-invalid-imm.ll +new file mode 100644 +index 000000000000..3f6fd44f842c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitseli_b_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvbitseli_b_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> %va, <32 x i8> %vb, i32 256) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-non-imm.ll +new file mode 100644 +index 000000000000..40533ab96d86 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitseli-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitseli_b(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseli.b(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-invalid-imm.ll +new file mode 100644 +index 000000000000..17a77ece7775 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitseti_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvbitseti_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvbitseti_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvbitseti_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.h: argument out of range ++entry: ++ %res = call <16 x i16> 
@llvm.loongarch.lasx.xvbitseti.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitseti_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvbitseti_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitseti_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvbitseti_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbitseti.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-non-imm.ll +new file mode 100644 +index 000000000000..613285804e0e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bitset-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbitseti_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbitseti.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16>, i32) ++ 
++define <16 x i16> @lasx_xvbitseti_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvbitseti.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvbitseti_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvbitseti.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvbitseti_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvbitseti.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-invalid-imm.ll +new file mode 100644 +index 000000000000..1da08a633bd2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbsll_v_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbsll.v: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvbsll_v_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbsll.v: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-non-imm.ll +new file mode 100644 +index 000000000000..e19a3232c179 
+--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsll-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbsll_v(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsll.v(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-invalid-imm.ll +new file mode 100644 +index 000000000000..5d2b63391e67 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbsrl_v_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbsrl.v: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvbsrl_v_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvbsrl.v: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-non-imm.ll +new file mode 100644 +index 000000000000..8dfd0ca579b8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-bsrl-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvbsrl_v(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = 
call <32 x i8> @llvm.loongarch.lasx.xvbsrl.v(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-invalid-imm.ll +new file mode 100644 +index 000000000000..1301b8a146eb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvextrins_b_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvextrins_b_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> %va, <32 x i8> %vb, i32 256) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvextrins_h_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvextrins_h_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> %va, <16 x i16> %vb, i32 256) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvextrins_w_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lasx.xvextrins.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvextrins_w_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> %va, <8 x i32> %vb, i32 256) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvextrins_d_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvextrins_d_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvextrins.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> %va, <4 x i64> %vb, i32 256) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-non-imm.ll +new file mode 100644 +index 000000000000..bca8f8b3c778 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-extrins-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvextrins_b(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvextrins.b(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvextrins_h(<16 x i16> %va, <16 x 
i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvextrins.h(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvextrins_w(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvextrins.w(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvextrins_d(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvextrins.d(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-invalid-imm.ll +new file mode 100644 +index 000000000000..64b4632669d2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-invalid-imm.ll +@@ -0,0 +1,33 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvfrstpi_b_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvfrstpi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvfrstpi_b_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvfrstpi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> %va, <32 x i8> %vb, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> 
@llvm.loongarch.lasx.xvfrstpi.h(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvfrstpi_h_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvfrstpi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvfrstpi_h_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvfrstpi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-non-imm.ll +new file mode 100644 +index 000000000000..ca92cff9b2d1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frstp-non-imm.ll +@@ -0,0 +1,19 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvfrstpi_b(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvfrstpi.b(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvfrstpi_h(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvfrstpi.h(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-invalid-imm.ll +new file mode 100644 +index 000000000000..4982f2c7d43a +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-invalid-imm.ll +@@ -0,0 +1,33 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32>, i32, i32) ++ ++define <8 x i32> @lasx_xvinsgr2vr_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsgr2vr.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> %va, i32 1, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvinsgr2vr_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsgr2vr.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> %va, i32 1, i32 8) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32) ++ ++define <4 x i64> @lasx_xvinsgr2vr_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsgr2vr.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %va, i64 1, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvinsgr2vr_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsgr2vr.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %va, i64 1, i32 4) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-non-imm.ll +new file mode 100644 +index 000000000000..3accabf6dbd9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insgr2vr-non-imm.ll +@@ -0,0 +1,19 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32>, i32, i32) ++ ++define <8 x i32> @lasx_xvinsgr2vr_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> 
@llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32> %va, i32 1, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32) ++ ++define <4 x i64> @lasx_xvinsgr2vr_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %va, i64 1, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-invalid-imm.ll +new file mode 100644 +index 000000000000..a54fa8515fba +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-invalid-imm.ll +@@ -0,0 +1,33 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvinsve0_w_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsve0.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvinsve0_w_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsve0.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> %va, <8 x i32> %vb, i32 8) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvinsve0_d_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsve0.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvinsve0_d_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvinsve0.d: argument out of range ++entry: ++ %res = call <4 x i64> 
@llvm.loongarch.lasx.xvinsve0.d(<4 x i64> %va, <4 x i64> %vb, i32 4) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-non-imm.ll +new file mode 100644 +index 000000000000..53e59db11aa6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-insve0-non-imm.ll +@@ -0,0 +1,19 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvinsve0_w(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvinsve0.w(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvinsve0_d(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvinsve0.d(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-invalid-imm.ll +new file mode 100644 +index 000000000000..20dd8a45d7f0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvld(i8*, i32) ++ ++define <32 x i8> @lasx_xvld_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvld: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvld(i8* %p, i32 -2049) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvld_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvld: argument out of range ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvld(i8* %p, i32 2048) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-non-imm.ll +new file mode 100644 +index 000000000000..b23436a44832 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ld-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvld(i8*, i32) ++ ++define <32 x i8> @lasx_xvld(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvld(i8* %p, i32 %a) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-invalid-imm.ll +new file mode 100644 +index 000000000000..f3dd3650cf8a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-invalid-imm.ll +@@ -0,0 +1,81 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvldi(i32) ++ ++define <4 x i64> @lasx_xvldi_lo() nounwind { ++; CHECK: llvm.loongarch.lasx.xvldi: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 -4097) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvldi_hi() nounwind { ++; CHECK: llvm.loongarch.lasx.xvldi: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 4096) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32) ++ ++define <32 x i8> @lasx_xvrepli_b_lo() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 -513) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvrepli_b_hi() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.b: argument out of range ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvrepli.b(i32 512) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32) ++ ++define <16 x i16> @lasx_xvrepli_h_lo() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 -513) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvrepli_h_hi() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 512) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32) ++ ++define <8 x i32> @lasx_xvrepli_w_lo() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 -513) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvrepli_w_hi() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 512) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32) ++ ++define <4 x i64> @lasx_xvrepli_d_lo() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 -513) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvrepli_d_hi() nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepli.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 512) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-non-imm.ll +new file mode 100644 +index 000000000000..6466818bf674 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldi-non-imm.ll +@@ -0,0 +1,46 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvldi(i32) ++ ++define <4 x 
i64> @lasx_xvldi(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldi(i32 %a) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32) ++ ++define <32 x i8> @lasx_xvrepli_b(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepli.b(i32 %a) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32) ++ ++define <16 x i16> @lasx_xvrepli_h(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepli.h(i32 %a) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32) ++ ++define <8 x i32> @lasx_xvrepli_w(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepli.w(i32 %a) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32) ++ ++define <4 x i64> @lasx_xvrepli_d(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepli.d(i32 %a) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-invalid-imm.ll +new file mode 100644 +index 000000000000..cb62a839985a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8*, i32) ++ ++define <32 x i8> @lasx_xvldrepl_b_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8* %p, i32 -2049) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvldrepl_b_hi(i8* 
%p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8* %p, i32 2048) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8*, i32) ++ ++define <16 x i16> @lasx_xvldrepl_h_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.h: argument out of range or not a multiple of 2. ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8* %p, i32 -2050) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvldrepl_h_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.h: argument out of range or not a multiple of 2. ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8* %p, i32 2048) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8*, i32) ++ ++define <8 x i32> @lasx_xvldrepl_w_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.w: argument out of range or not a multiple of 4. ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8* %p, i32 -2052) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvldrepl_w_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.w: argument out of range or not a multiple of 4. ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8* %p, i32 2048) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8*, i32) ++ ++define <4 x i64> @lasx_xvldrepl_d_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.d: argument out of range or not a multiple of 8. ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8* %p, i32 -2056) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvldrepl_d_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvldrepl.d: argument out of range or not a multiple of 8. 
++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8* %p, i32 2048) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-non-imm.ll +new file mode 100644 +index 000000000000..075d663b0dd7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ldrepl-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8*, i32) ++ ++define <32 x i8> @lasx_xvldrepl_b(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvldrepl.b(i8* %p, i32 %a) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8*, i32) ++ ++define <16 x i16> @lasx_xvldrepl_h(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvldrepl.h(i8* %p, i32 %a) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8*, i32) ++ ++define <8 x i32> @lasx_xvldrepl_w(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvldrepl.w(i8* %p, i32 %a) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8*, i32) ++ ++define <4 x i64> @lasx_xvldrepl_d(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvldrepl.d(i8* %p, i32 %a) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll +new file mode 100644 +index 000000000000..a671e9979b2f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 
--mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmaxi_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> %va, i32 -17) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvmaxi_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> %va, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmaxi_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> %va, i32 -17) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvmaxi_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmaxi_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> %va, i32 -17) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvmaxi_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmaxi_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> %va, i32 -17) 
++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvmaxi_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> %va, i32 16) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmaxi_bu_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvmaxi_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmaxi_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvmaxi_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> %va, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmaxi_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvmaxi_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> 
@llvm.loongarch.lasx.xvmaxi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmaxi_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvmaxi_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmaxi.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-non-imm.ll +new file mode 100644 +index 000000000000..b85798b53c92 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmaxi_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmaxi_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmaxi_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmaxi_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has 
non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmaxi_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmaxi.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmaxi_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmaxi.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmaxi_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmaxi.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmaxi_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmaxi.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll +new file mode 100644 +index 000000000000..5ed4104c295f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmini_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.b: argument out of range ++entry: ++ %res = call <32 x i8> 
@llvm.loongarch.lasx.xvmini.b(<32 x i8> %va, i32 -17) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvmini_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> %va, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmini_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> %va, i32 -17) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvmini_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmini_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> %va, i32 -17) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvmini_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmini_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> %va, i32 -17) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvmini_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> %va, i32 16) ++ ret <4 x i64> %res ++} ++ 
++declare <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmini_bu_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvmini_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmini_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvmini_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> %va, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmini_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvmini_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmini_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res 
++} ++ ++define <4 x i64> @lasx_xvmini_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvmini.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-non-imm.ll +new file mode 100644 +index 000000000000..b81931977aad +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmini_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmini_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmini_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmini_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvmini_bu(<32 x i8> %va, i32 %b) nounwind { ++; 
CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvmini.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvmini_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvmini.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvmini_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvmini.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvmini_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvmini.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-invalid-imm.ll +new file mode 100644 +index 000000000000..1130e094bf1f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvnori_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvnori.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvnori_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvnori.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> %va, 
i32 256) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-non-imm.ll +new file mode 100644 +index 000000000000..8f2333064d64 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-nori-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvnori_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvnori.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-invalid-imm.ll +new file mode 100644 +index 000000000000..90dec8e55f2d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvori_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvori.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvori_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvori.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> %va, i32 256) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-non-imm.ll +new file mode 100644 +index 000000000000..ae6571d98f4a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ori-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ 
++declare <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvori_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvori.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-invalid-imm.ll +new file mode 100644 +index 000000000000..41f4856bd8f7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-invalid-imm.ll +@@ -0,0 +1,49 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvpermi_w_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpermi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvpermi_w_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpermi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> %va, <8 x i32> %vb, i32 256) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvpermi_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpermi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvpermi_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpermi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> %va, i32 256) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> 
@lasx_xvpermi_q_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpermi.q: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvpermi_q_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpermi.q: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 256) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-non-imm.ll +new file mode 100644 +index 000000000000..afb335c5d6ca +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi-non-imm.ll +@@ -0,0 +1,28 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvpermi_w(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpermi.w(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvpermi_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpermi.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvpermi_q(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-invalid-imm.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-invalid-imm.ll +new file mode 100644 +index 000000000000..cfc6ec42874e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvpickve_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvpickve_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> %va, i32 8) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvpickve_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvpickve_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> %va, i32 4) ++ ret <4 x i64> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float>, i32) ++ ++define <8 x float> @lasx_xvpickve_w_f_lo(<8 x float> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.w.f: argument out of range ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> %va, i32 -1) ++ ret <8 x float> %res ++} ++ ++define <8 x float> @lasx_xvpickve_w_f_hi(<8 x float> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.w.f: argument out of range ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x 
float> %va, i32 8) ++ ret <8 x float> %res ++} ++ ++declare <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double>, i32) ++ ++define <4 x double> @lasx_xvpickve_d_f_lo(<4 x double> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.d.f: argument out of range ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> %va, i32 -1) ++ ret <4 x double> %res ++} ++ ++define <4 x double> @lasx_xvpickve_d_f_hi(<4 x double> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve.d.f: argument out of range ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> %va, i32 4) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-non-imm.ll +new file mode 100644 +index 000000000000..be1f19a89737 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvpickve_w(<8 x i32> %va, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvpickve.w(<8 x i32> %va, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvpickve_d(<4 x i64> %va, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvpickve.d(<4 x i64> %va, i32 %c) ++ ret <4 x i64> %res ++} ++ ++declare <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float>, i32) ++ ++define <8 x float> @lasx_xvpickve_w_f(<8 x float> %va, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x float> @llvm.loongarch.lasx.xvpickve.w.f(<8 x float> %va, i32 %c) ++ ret <8 x float> %res ++} ++ 
++declare <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double>, i32) ++ ++define <4 x double> @lasx_xvpickve_d_f(<4 x double> %va, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x double> @llvm.loongarch.lasx.xvpickve.d.f(<4 x double> %va, i32 %c) ++ ret <4 x double> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll +new file mode 100644 +index 000000000000..93056b272dfc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) ++ ++define i32 @lasx_xvpickve2gr_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.w: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lasx_xvpickve2gr_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.w: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> %va, i32 8) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) ++ ++define i64 @lasx_xvpickve2gr_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 -1) ++ ret i64 %res ++} ++ ++define i64 @lasx_xvpickve2gr_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 4) ++ ret i64 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, i32) ++ ++define i32 @lasx_xvpickve2gr_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: 
llvm.loongarch.lasx.xvpickve2gr.wu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lasx_xvpickve2gr_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.wu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 8) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) ++ ++define i64 @lasx_xvpickve2gr_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 -1) ++ ret i64 %res ++} ++ ++define i64 @lasx_xvpickve2gr_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 4) ++ ret i64 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-non-imm.ll +new file mode 100644 +index 000000000000..0fa8c94adc60 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) ++ ++define i32 @lasx_xvpickve2gr_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) ++ ++define i64 @lasx_xvpickve2gr_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 %b) ++ ret i64 %res ++} ++ ++declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, 
i32) ++ ++define i32 @lasx_xvpickve2gr_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) ++ ++define i64 @lasx_xvpickve2gr_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 %b) ++ ret i64 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-invalid-imm.ll +new file mode 100644 +index 000000000000..a0cb309c54e1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvrepl128vei_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvrepl128vei_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8> %va, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvrepl128vei_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvrepl128vei_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.h: 
argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> %va, i32 8) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvrepl128vei_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvrepl128vei_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> %va, i32 4) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvrepl128vei_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvrepl128vei_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrepl128vei.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> %va, i32 2) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-non-imm.ll +new file mode 100644 +index 000000000000..c537ffa66ba7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl128vei-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvrepl128vei_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrepl128vei.b(<32 x 
i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvrepl128vei_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrepl128vei.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvrepl128vei_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrepl128vei.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvrepl128vei_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrepl128vei.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-invalid-imm.ll +new file mode 100644 +index 000000000000..40abdf497605 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvrotri_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvrotri_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> 
@llvm.loongarch.lasx.xvrotri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvrotri_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvrotri_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvrotri_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvrotri_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvrotri_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvrotri_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvrotri.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-non-imm.ll +new file mode 100644 +index 000000000000..dd38301d0534 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-rotr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 
--mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvrotri_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvrotri.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvrotri_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvrotri.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvrotri_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvrotri.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvrotri_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvrotri.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-invalid-imm.ll +new file mode 100644 +index 000000000000..839fbc9990d3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsat_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> 
@lasx_xvsat_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsat_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsat_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsat_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsat_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsat_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsat_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsat_bu_lo(<32 x i8> %va) nounwind 
{ ++; CHECK: llvm.loongarch.lasx.xvsat.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsat_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsat_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsat_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsat_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsat_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsat_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsat_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsat.du: argument out of range ++entry: ++ %res = 
call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-non-imm.ll +new file mode 100644 +index 000000000000..b73b32ebd3b0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sat-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsat_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsat_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsat_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsat_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsat_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsat.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ 
++declare <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsat_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsat.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsat_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsat.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsat_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsat.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-invalid-imm.ll +new file mode 100644 +index 000000000000..bb6ef0cc6574 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvseqi_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> %va, i32 -17) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvseqi_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> %va, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvseqi_h_lo(<16 x i16> %va) nounwind { ++; 
CHECK: llvm.loongarch.lasx.xvseqi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> %va, i32 -17) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvseqi_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvseqi_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> %va, i32 -17) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvseqi_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvseqi_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> %va, i32 -17) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvseqi_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvseqi.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> %va, i32 16) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-non-imm.ll +new file mode 100644 +index 000000000000..fb2c6206da7b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-seq-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8>, i32) ++ ++define <32 x i8> 
@lasx_xvseqi_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvseqi.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvseqi_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvseqi.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvseqi_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvseqi.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvseqi_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvseqi.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-invalid-imm.ll +new file mode 100644 +index 000000000000..9217d1f6a05d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvshuf4i_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvshuf4i_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.b: argument out of range ++entry: ++ %res = 
call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> %va, i32 256) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvshuf4i_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvshuf4i_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> %va, i32 256) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvshuf4i_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvshuf4i_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> %va, i32 256) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvshuf4i_d_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvshuf4i_d_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvshuf4i.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> %va, <4 x i64> %vb, i32 256) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-non-imm.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-non-imm.ll +new file mode 100644 +index 000000000000..8d6d1c694193 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-shuf4i-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvshuf4i_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvshuf4i.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvshuf4i_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvshuf4i.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvshuf4i_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvshuf4i.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvshuf4i_d(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvshuf4i.d(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-invalid-imm.ll +new file mode 100644 +index 000000000000..5b10aca9801d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> 
@llvm.loongarch.lasx.xvslei.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslei_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> %va, i32 -17) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvslei_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> %va, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslei_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> %va, i32 -17) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvslei_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslei_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> %va, i32 -17) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvslei_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslei_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> %va, i32 -17) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> 
@lasx_xvslei_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64> %va, i32 16) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslei_bu_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvslei_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslei_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvslei_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> %va, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslei_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvslei_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64>, i32) ++ ++define <4 x 
i64> @lasx_xvslei_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvslei_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslei.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-non-imm.ll +new file mode 100644 +index 000000000000..903bc10d88b7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sle-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslei_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslei_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslei_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslei.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslei_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> 
@llvm.loongarch.lasx.xvslei.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslei_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslei.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslei_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslei.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslei_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslei.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslei_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslei.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-invalid-imm.ll +new file mode 100644 +index 000000000000..bf8205376a6c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslli_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> %va, i32 -1) ++ ret <32 x 
i8> %res ++} ++ ++define <32 x i8> @lasx_xvslli_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslli_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvslli_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslli_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvslli_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslli_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvslli_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslli.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-non-imm.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-non-imm.ll +new file mode 100644 +index 000000000000..b5368a86b5c3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sll-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslli_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslli.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslli_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslli.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslli_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslli.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslli_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslli.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-invalid-imm.ll +new file mode 100644 +index 000000000000..18803767d6c0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-invalid-imm.ll +@@ -0,0 +1,97 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8>, i32) ++ ++define 
<16 x i16> @lasx_xvsllwil_h_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.h.b: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsllwil_h_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.h.b: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> %va, i32 8) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16>, i32) ++ ++define <8 x i32> @lasx_xvsllwil_w_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.w.h: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsllwil_w_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.w.h: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32>, i32) ++ ++define <4 x i64> @lasx_xvsllwil_d_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.d.w: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsllwil_d_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.d.w: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> %va, i32 32) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8>, i32) ++ ++define <16 x i16> @lasx_xvsllwil_hu_bu_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.hu.bu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> %va, i32 -1) ++ ret <16 x i16> 
%res ++} ++ ++define <16 x i16> @lasx_xvsllwil_hu_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.hu.bu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> %va, i32 8) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16>, i32) ++ ++define <8 x i32> @lasx_xvsllwil_wu_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.wu.hu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsllwil_wu_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.wu.hu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32>, i32) ++ ++define <4 x i64> @lasx_xvsllwil_du_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.du.wu: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsllwil_du_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsllwil.du.wu: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-non-imm.ll +new file mode 100644 +index 000000000000..3f5d4d631671 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sllwil-non-imm.ll +@@ -0,0 +1,55 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8>, i32) ++ ++define <16 x i16> @lasx_xvsllwil_h_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: 
immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.h.b(<32 x i8> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16>, i32) ++ ++define <8 x i32> @lasx_xvsllwil_w_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.w.h(<16 x i16> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32>, i32) ++ ++define <4 x i64> @lasx_xvsllwil_d_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.d.w(<8 x i32> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8>, i32) ++ ++define <16 x i16> @lasx_xvsllwil_hu_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsllwil.hu.bu(<32 x i8> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16>, i32) ++ ++define <8 x i32> @lasx_xvsllwil_wu_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsllwil.wu.hu(<16 x i16> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32>, i32) ++ ++define <4 x i64> @lasx_xvsllwil_du_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsllwil.du.wu(<8 x i32> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-invalid-imm.ll +new file mode 100644 +index 000000000000..dc0567da4e47 +--- /dev/null 
++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslti_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> %va, i32 -17) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvslti_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> %va, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslti_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> %va, i32 -17) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvslti_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslti_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> %va, i32 -17) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvslti_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> %va, i32 16) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslti_d_lo(<4 x i64> %va) nounwind { ++; CHECK: 
llvm.loongarch.lasx.xvslti.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> %va, i32 -17) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvslti_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> %va, i32 16) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslti_bu_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvslti_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslti_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvslti_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> %va, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslti_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvslti_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.wu: argument out of range ++entry: ++ 
%res = call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslti_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvslti_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvslti.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-non-imm.ll +new file mode 100644 +index 000000000000..a2cedc8d3ef3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-slt-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslti_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslti_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslti_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> 
@llvm.loongarch.lasx.xvslti.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslti_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvslti_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvslti.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvslti_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvslti.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvslti_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvslti.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvslti_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvslti.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-invalid-imm.ll +new file mode 100644 +index 000000000000..15b522d5e7e3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8>, i32) ++ ++define <32 x i8> 
@lasx_xvsrai_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrai_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrai_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrai_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrai_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrai_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrai_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrai_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrai.d: 
argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-non-imm.ll +new file mode 100644 +index 000000000000..fefee7246ae6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-sra-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrai_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrai.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrai_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrai.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrai_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrai.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrai_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrai.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-invalid-imm.ll +new file mode 100644 +index 000000000000..bedbfc4889d2 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrani_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrani_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrani_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrani_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrani_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrani_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.w.d: argument out of range ++entry: ++ %res 
= call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrani_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrani_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrani.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-non-imm.ll +new file mode 100644 +index 000000000000..3c17f2b6090a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srani-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrani_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrani_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrani_w_d(<8 x i32> %va, <8 x 
i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrani_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-invalid-imm.ll +new file mode 100644 +index 000000000000..e417e3cc5bbf +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrari_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrari_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrari_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrari_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.h: argument out of range ++entry: ++ %res = call <16 x i16> 
@llvm.loongarch.lasx.xvsrari.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrari_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrari_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrari_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrari_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrari.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-non-imm.ll +new file mode 100644 +index 000000000000..15fed7966f1c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srar-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrari_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrari.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrari_h(<16 x 
i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrari.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrari_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrari.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrari_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrari.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-invalid-imm.ll +new file mode 100644 +index 000000000000..83e977827e2d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrarni_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrarni_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrarni_h_w_lo(<16 x i16> 
%va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrarni_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrarni_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrarni_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrarni_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrarni_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrarni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-non-imm.ll +new file mode 100644 +index 
000000000000..eb577a29fb33 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srarni-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrarni_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrarni_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrarni_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrarni_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-invalid-imm.ll +new file mode 100644 +index 000000000000..3ab02dcb97ed +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 
--mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrli_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrli_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrli_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrli_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrli_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrli_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrli_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> %va, i32 -1) ++ 
ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrli_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrli.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-non-imm.ll +new file mode 100644 +index 000000000000..bc085aeaa232 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srl-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrli_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrli.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrli_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrli.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrli_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrli.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrli_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrli.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-invalid-imm.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-invalid-imm.ll +new file mode 100644 +index 000000000000..9e7c94305630 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlni_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrlni_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlni_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrlni_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlni_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> 
@lasx_xvsrlni_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlni_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrlni_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-non-imm.ll +new file mode 100644 +index 000000000000..66d800470003 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlni-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlni_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlni_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} 
++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlni_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlni_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-invalid-imm.ll +new file mode 100644 +index 000000000000..52621ddc6f49 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlri_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrlri_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> %va, i32 8) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlri_h_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> 
@lasx_xvsrlri_h_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.h: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> %va, i32 16) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlri_w_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrlri_w_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.w: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlri_d_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrlri_d_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlri.d: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> %va, i32 64) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-non-imm.ll +new file mode 100644 +index 000000000000..5663e3475b12 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlri_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlri.b(<32 x i8> %va, i32 %b) 
++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlri_h(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlri.h(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlri_w(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlri.w(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlri_d(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlri.d(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-invalid-imm.ll +new file mode 100644 +index 000000000000..2d65a75b175a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlrni_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsrlrni_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} 
++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlrni_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsrlrni_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlrni_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsrlrni_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlrni_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsrlrni_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsrlrni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-non-imm.ll +new file mode 100644 +index 000000000000..82da0d21d013 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-srlrni-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsrlrni_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsrlrni_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsrlrni_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsrlrni_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-invalid-imm.ll +new file mode 100644 +index 
000000000000..e10d5d7bd488 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrani_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrani_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrani_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrani_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrani_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrani_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lasx.xvssrani.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrani_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrani_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrani_bu_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrani_bu_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrani_hu_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrani_hu_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lasx.xvssrani.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrani_wu_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrani_wu_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrani_du_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrani_du_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrani.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-non-imm.ll +new file mode 100644 +index 000000000000..a928cc2de8c8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrani-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> 
@lasx_xvssrani_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrani_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrani.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrani_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrani_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrani_bu_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrani.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrani_hu_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> 
@llvm.loongarch.lasx.xvssrani.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrani_wu_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrani.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrani_du_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrani.du.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-invalid-imm.ll +new file mode 100644 +index 000000000000..42cd6ac99754 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrarni_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrarni_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> 
@lasx_xvssrarni_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrarni_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrarni_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrarni_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrarni_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrarni_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> 
@lasx_xvssrarni_bu_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrarni_bu_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrarni_hu_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrarni_hu_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrarni_wu_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrarni_wu_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64>, <4 x i64>, i32) ++ 
++define <4 x i64> @lasx_xvssrarni_du_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrarni_du_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrarni.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-non-imm.ll +new file mode 100644 +index 000000000000..f050e7d79b0f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrarni-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrarni_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrarni_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrarni_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.w.d(<8 x i32> 
%va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrarni_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrarni_bu_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrarni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrarni_hu_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrarni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrarni_wu_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrarni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrarni_du_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrarni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-invalid-imm.ll 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-invalid-imm.ll +new file mode 100644 +index 000000000000..26be21a83aa4 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlni_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrlni_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlni_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrlni_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlni_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ 
++define <8 x i32> @lasx_xvssrlni_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlni_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrlni_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlni_bu_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrlni_bu_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlni_hu_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> 
@lasx_xvssrlni_hu_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlni_wu_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrlni_wu_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlni_du_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrlni_du_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlni.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-non-imm.ll +new file mode 100644 +index 000000000000..72da2a746dd5 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlni-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> 
@llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlni_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlni_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlni_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlni_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlni_bu_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlni_hu_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand 
has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlni_wu_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlni_du_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-invalid-imm.ll +new file mode 100644 +index 000000000000..cd778e2c0627 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlrni_b_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrlrni_b_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.b.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> 
@llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlrni_h_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrlrni_h_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.h.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlrni_w_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrlrni_w_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.w.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlrni_d_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrlrni_d_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.d.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> 
@llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlrni_bu_h_lo(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvssrlrni_bu_h_hi(<32 x i8> %va, <32 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.bu.h: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 16) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlrni_hu_w_lo(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvssrlrni_hu_w_hi(<16 x i16> %va, <16 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.hu.w: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlrni_wu_d_lo(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvssrlrni_wu_d_hi(<8 x i32> %va, <8 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.wu.d: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 64) ++ ret <8 x i32> %res ++} ++ 
++declare <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlrni_du_q_lo(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvssrlrni_du_q_hi(<4 x i64> %va, <4 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lasx.xvssrlrni.du.q: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 128) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-non-imm.ll +new file mode 100644 +index 000000000000..a10c54329149 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-ssrlrni-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlrni_b_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.b.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlrni_h_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.h.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlrni_w_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate 
parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.w.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlrni_d_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.d.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8>, <32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvssrlrni_bu_h(<32 x i8> %va, <32 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvssrlrni.bu.h(<32 x i8> %va, <32 x i8> %vb, i32 %c) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16>, <16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvssrlrni_hu_w(<16 x i16> %va, <16 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvssrlrni.hu.w(<16 x i16> %va, <16 x i16> %vb, i32 %c) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32>, <8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvssrlrni_wu_d(<8 x i32> %va, <8 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvssrlrni.wu.d(<8 x i32> %va, <8 x i32> %vb, i32 %c) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64>, <4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvssrlrni_du_q(<4 x i64> %va, <4 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvssrlrni.du.q(<4 x i64> %va, <4 x i64> %vb, i32 %c) ++ ret <4 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-invalid-imm.ll +new file mode 100644 +index 000000000000..0177f2b77b93 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lasx.xvst(<32 x i8>, i8*, i32) ++ ++define void @lasx_xvst_lo(<32 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvst: argument out of range ++entry: ++ call void @llvm.loongarch.lasx.xvst(<32 x i8> %va, i8* %p, i32 -2049) ++ ret void ++} ++ ++define void @lasx_xvst_hi(<32 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvst: argument out of range ++entry: ++ call void @llvm.loongarch.lasx.xvst(<32 x i8> %va, i8* %p, i32 2048) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-non-imm.ll +new file mode 100644 +index 000000000000..c19207aad6b8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-st-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lasx.xvst(<32 x i8>, i8*, i32) ++ ++define void @lasx_xvst(<32 x i8> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvst(<32 x i8> %va, i8* %p, i32 %b) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-invalid-imm.ll +new file mode 100644 +index 000000000000..0ea2484e090d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-invalid-imm.ll +@@ -0,0 +1,121 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lasx.xvstelm.b(<32 x i8>, i8*, i32, i32) ++ ++define void 
@lasx_xvstelm_b_lo(<32 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 -129, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_b_hi(<32 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 128, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_b_idx_lo(<32 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 1, i32 -1) ++ ret void ++} ++ ++define void @lasx_xvstelm_b_idx_hi(<32 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 1, i32 32) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.h(<16 x i16>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_h_lo(<16 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 -258, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_h_hi(<16 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 256, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_h_idx_lo(<16 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.h: argument out of range or not a multiple of 2. 
++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 2, i32 -1) ++ ret void ++} ++ ++define void @lasx_xvstelm_h_idx_hi(<16 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 2, i32 16) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.w(<8 x i32>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_w_lo(<8 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 -516, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_w_hi(<8 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 512, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_w_idx_lo(<8 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 4, i32 -1) ++ ret void ++} ++ ++define void @lasx_xvstelm_w_idx_hi(<8 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 4, i32 8) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.d(<4 x i64>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_d_lo(<4 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.d: argument out of range or not a multiple of 8. 
++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 -1032, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_d_hi(<4 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.d: argument out of range or not a multiple of 8. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 1024, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_d_idx_lo(<4 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.d: argument out of range or not a multiple of 8. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 8, i32 -1) ++ ret void ++} ++ ++define void @lasx_xvstelm_d_idx_hi(<4 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lasx.xvstelm.d: argument out of range or not a multiple of 8. ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 8, i32 4) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-non-imm.ll +new file mode 100644 +index 000000000000..42c7c0da1746 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-stelm-non-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lasx.xvstelm.b(<32 x i8>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_b(<32 x i8> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_b_idx(<32 x i8> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.b(<32 x i8> %va, i8* %p, i32 1, i32 %b) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.h(<16 x i16>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_h(<16 x i16> %va, i8* %p, i32 %b) nounwind { ++; CHECK: 
immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_h_idx(<16 x i16> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.h(<16 x i16> %va, i8* %p, i32 2, i32 %b) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.w(<8 x i32>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_w(<8 x i32> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_w_idx(<8 x i32> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.w(<8 x i32> %va, i8* %p, i32 4, i32 %b) ++ ret void ++} ++ ++declare void @llvm.loongarch.lasx.xvstelm.d(<4 x i64>, i8*, i32, i32) ++ ++define void @lasx_xvstelm_d(<4 x i64> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lasx_xvstelm_d_idx(<4 x i64> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lasx.xvstelm.d(<4 x i64> %va, i8* %p, i32 8, i32 %b) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-invalid-imm.ll +new file mode 100644 +index 000000000000..810008c17f7e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsubi_bu_lo(<32 x i8> %va) 
nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvsubi_bu_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.bu: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> %va, i32 32) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsubi_hu_lo(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> %va, i32 -1) ++ ret <16 x i16> %res ++} ++ ++define <16 x i16> @lasx_xvsubi_hu_hi(<16 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.hu: argument out of range ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> %va, i32 32) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsubi_wu_lo(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> %va, i32 -1) ++ ret <8 x i32> %res ++} ++ ++define <8 x i32> @lasx_xvsubi_wu_hi(<8 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.wu: argument out of range ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> %va, i32 32) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsubi_du_lo(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.du: argument out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> %va, i32 -1) ++ ret <4 x i64> %res ++} ++ ++define <4 x i64> @lasx_xvsubi_du_hi(<4 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvsubi.du: argument 
out of range ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> %va, i32 32) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-non-imm.ll +new file mode 100644 +index 000000000000..924b89ce9d6c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-subi-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvsubi_bu(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvsubi.bu(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} ++ ++declare <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16>, i32) ++ ++define <16 x i16> @lasx_xvsubi_hu(<16 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i16> @llvm.loongarch.lasx.xvsubi.hu(<16 x i16> %va, i32 %b) ++ ret <16 x i16> %res ++} ++ ++declare <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32>, i32) ++ ++define <8 x i32> @lasx_xvsubi_wu(<8 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i32> @llvm.loongarch.lasx.xvsubi.wu(<8 x i32> %va, i32 %b) ++ ret <8 x i32> %res ++} ++ ++declare <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64>, i32) ++ ++define <4 x i64> @lasx_xvsubi_du(<4 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i64> @llvm.loongarch.lasx.xvsubi.du(<4 x i64> %va, i32 %b) ++ ret <4 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-invalid-imm.ll +new file mode 100644 +index 000000000000..0170d204cf42 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvxori_b_lo(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvxori.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> %va, i32 -1) ++ ret <32 x i8> %res ++} ++ ++define <32 x i8> @lasx_xvxori_b_hi(<32 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lasx.xvxori.b: argument out of range ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> %va, i32 256) ++ ret <32 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-non-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-non-imm.ll +new file mode 100644 +index 000000000000..1478f691a1cc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-xori-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s ++ ++declare <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8>, i32) ++ ++define <32 x i8> @lasx_xvxori_b(<32 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <32 x i8> @llvm.loongarch.lasx.xvxori.b(<32 x i8> %va, i32 %b) ++ ret <32 x i8> %res ++} +-- +2.20.1 + + +From 7f172768f1132b99d4bacf4daf119a9154428b52 Mon Sep 17 00:00:00 2001 +From: chenli +Date: Sat, 19 Aug 2023 17:15:19 +0800 +Subject: [PATCH 07/35] [LoongArch][MC] Add invalid immediate testcases for LSX + instructions + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D157573 + +(cherry picked from commit 2f4b6695836e16ec075061cd2508444bd403ad7d) +--- + llvm/test/MC/LoongArch/lsx/invalid-imm.s | 1149 +++++++++++++++++++++- + 1 file changed, 1143 insertions(+), 6 deletions(-) + +diff --git a/llvm/test/MC/LoongArch/lsx/invalid-imm.s 
b/llvm/test/MC/LoongArch/lsx/invalid-imm.s +index fb7e24c83488..c3f9aaa08281 100644 +--- a/llvm/test/MC/LoongArch/lsx/invalid-imm.s ++++ b/llvm/test/MC/LoongArch/lsx/invalid-imm.s +@@ -3,53 +3,1190 @@ + # RUN: not llvm-mc --triple=loongarch64 %s 2>&1 | FileCheck %s + + ## uimm1 ++vstelm.d $vr0, $a0, 8, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 1] ++ ++vstelm.d $vr0, $a0, 8, 2 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 1] ++ ++vreplvei.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 1] ++ + vreplvei.d $vr0, $vr1, 2 + # CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 1] + ++vpickve2gr.du $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 1] ++ ++vpickve2gr.du $a0, $vr1, 2 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 1] ++ ++vpickve2gr.d $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 1] ++ ++vpickve2gr.d $a0, $vr1, 2 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 1] ++ ++vinsgr2vr.d $vr0, $a0, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 1] ++ ++vinsgr2vr.d $vr0, $a0, 2 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 1] ++ ++## uimm2 ++vstelm.w $vr0, $a0, 4, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++vstelm.w $vr0, $a0, 4, 4 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++vreplvei.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++vreplvei.w $vr0, $vr1, 4 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++vpickve2gr.wu $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an 
integer in the range [0, 3] ++ ++vpickve2gr.wu $a0, $vr1, 4 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 3] ++ ++vpickve2gr.w $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 3] ++ ++vpickve2gr.w $a0, $vr1, 4 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 3] ++ ++vinsgr2vr.w $vr0, $a0, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++vinsgr2vr.w $vr0, $a0, 4 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++## uimm3 ++vstelm.h $vr0, $a0, 2, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vstelm.h $vr0, $a0, 2, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vreplvei.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vreplvei.h $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vpickve2gr.hu $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 7] ++ ++vpickve2gr.hu $a0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 7] ++ ++vpickve2gr.h $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++vpickve2gr.h $a0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++vinsgr2vr.h $vr0, $a0, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vinsgr2vr.h $vr0, $a0, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vbitrevi.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vbitrevi.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the 
range [0, 7] ++ ++vbitseti.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vbitseti.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vbitclri.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vbitclri.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++vsrari.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++vsrari.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++vsrlri.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++vsrlri.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++vsllwil.hu.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 7] ++ ++vsllwil.hu.bu $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 7] ++ ++vsllwil.h.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++vsllwil.h.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++vrotri.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++vrotri.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++vsrai.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vsrai.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vsrli.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vsrli.b $vr0, $vr1, 8 ++# 
CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vslli.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vslli.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vsat.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 7] ++ ++vsat.b $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 7] ++ ++vsat.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++vsat.bu $vr0, $vr1, 8 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ + ## uimm4 ++vstelm.b $vr0, $a0, 1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vstelm.b $vr0, $a0, 1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vreplvei.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vreplvei.b $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vpickve2gr.bu $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vpickve2gr.bu $a0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vpickve2gr.b $a0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vpickve2gr.b $a0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vinsgr2vr.b $vr0, $a0, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vinsgr2vr.b $vr0, $a0, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vbitrevi.h $vr0, $vr1, -1 ++# CHECK: 
:[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vbitrevi.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vbitseti.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vbitseti.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vbitclri.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vbitclri.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vssrarni.bu.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++vssrarni.bu.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++vssrlrni.bu.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++vssrlrni.bu.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++vssrarni.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrarni.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrlrni.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrlrni.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrani.bu.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrani.bu.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrlni.bu.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrlni.bu.h $vr0, 
$vr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++vssrani.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vssrani.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vssrlni.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vssrlni.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vsrarni.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vsrarni.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vsrlrni.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vsrlrni.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vsrani.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vsrani.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vsrlni.b.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vsrlni.b.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 15] ++ ++vsrari.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++vsrari.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++vsrlri.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++vsrlri.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++vsllwil.wu.hu $vr0, $vr1, 
-1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++vsllwil.wu.hu $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++vsllwil.w.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vsllwil.w.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++vrotri.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++vrotri.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++vsrai.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vsrai.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vsrli.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vsrli.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vslli.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vslli.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vsat.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 15] ++ + vsat.h $vr0, $vr1, 16 + # CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 15] + ++vsat.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++vsat.hu $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ ++## uimm5 ++vbsrl.v $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vbsrl.v $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:21: 
error: immediate must be an integer in the range [0, 31] ++ ++vbsll.v $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vbsll.v $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vslti.du $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.du $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.hu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslti.bu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.du $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.du $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.hu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vslei.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the 
range [0, 31] ++ ++vslei.bu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vfrstpi.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++vfrstpi.h $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++vfrstpi.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++vfrstpi.b $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++vbitrevi.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vbitrevi.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vbitseti.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vbitseti.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vbitclri.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vbitclri.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vssrarni.hu.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++vssrarni.hu.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++vssrlrni.hu.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++vssrlrni.hu.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++vssrarni.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrarni.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the 
range [0, 31] ++ ++vssrlrni.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrlrni.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrani.hu.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrani.hu.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrlni.hu.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrlni.hu.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++vssrani.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vssrani.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vssrlni.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vssrlni.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vsrarni.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vsrarni.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vsrlrni.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vsrlrni.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vsrani.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vsrani.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vsrlni.h.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an 
integer in the range [0, 31] ++ ++vsrlni.h.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++vsrari.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsrari.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsrlri.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsrlri.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsllwil.du.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++vsllwil.du.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++vsllwil.d.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vsllwil.d.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++vrotri.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vrotri.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsrai.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vsrai.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vsrli.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vsrli.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vslli.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++vslli.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ 
++vaddi.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.bu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.hu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.du $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vaddi.du $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.bu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.hu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.du $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsubi.du $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.bu $vr0, $vr1, 32 ++# CHECK: 
:[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.hu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.du $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmaxi.du $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.bu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.bu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.hu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.hu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.du $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vmini.du $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++vsat.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 31] ++ ++vsat.w $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 31] ++ ++vsat.wu $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an 
integer in the range [0, 31] ++ ++vsat.wu $vr0, $vr1, 32 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ + ## simm5 ++vslti.d $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.d $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.w $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.w $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.h $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.b $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslti.b $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.d $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.d $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.w $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.w $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.h $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.b $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vslei.b $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in 
the range [-16, 15] ++ ++vseqi.d $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vseqi.d $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vseqi.w $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vseqi.w $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vseqi.h $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vseqi.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vseqi.b $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ + vseqi.b $vr0, $vr1, 16 + # CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] + ++vmaxi.b $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.b $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.h $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.w $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.w $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.d $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmaxi.d $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.b $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] 
++ ++vmini.b $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.h $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.h $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.w $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.w $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.d $vr0, $vr1, -17 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++vmini.d $vr0, $vr1, 16 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-16, 15] ++ ++## uimm6 ++vbitrevi.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vbitrevi.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vbitseti.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vbitseti.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vbitclri.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vbitclri.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vssrarni.wu.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++vssrarni.wu.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++vssrlrni.wu.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++vssrlrni.wu.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range 
[0, 63] ++ ++vssrarni.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrarni.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrlrni.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrlrni.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrani.wu.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrani.wu.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrlni.wu.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrlni.wu.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++vssrani.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vssrani.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vssrlni.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vssrlni.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vsrarni.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vsrarni.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vsrlrni.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vsrlrni.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++vsrani.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer 
in the range [0, 63] ++ ++vsrani.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vsrlni.w.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vsrlni.w.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 63] ++ ++vsrari.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++vsrari.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++vsrlri.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++vsrlri.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++vrotri.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++vrotri.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++vsrai.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vsrai.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vsrli.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vsrli.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vslli.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vslli.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vsat.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 63] ++ ++vsat.d $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 63] ++ ++vsat.du $vr0, $vr1, -1 
++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++vsat.du $vr0, $vr1, 64 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ + ## uimm7 ++vssrarni.du.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++vssrarni.du.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++vssrlrni.du.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++vssrlrni.du.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++vssrarni.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrarni.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrlrni.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrlrni.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrani.du.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrani.du.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrlni.du.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrlni.du.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++vssrani.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vssrani.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vssrlni.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in 
the range [0, 127] ++ ++vssrlni.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vsrarni.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vsrarni.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vsrlrni.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vsrlrni.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++vsrani.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 127] ++ ++vsrani.d.q $vr0, $vr1, 128 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 127] ++ ++vsrlni.d.q $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 127] ++ + vsrlni.d.q $vr0, $vr1, 128 + # CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 127] + +-## simm8 ++## uimm8 ++vextrins.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.d $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.w $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.h $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vextrins.b $vr0, $vr1, 256 ++# CHECK: 
:[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vpermi.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ + vpermi.w $vr0, $vr1, 256 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] + ++vshuf4i.d $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.d $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.w $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.w $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.h $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.h $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vshuf4i.b $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++vbitseli.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vbitseli.b $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++vandi.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++vandi.b $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++vori.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 255] ++ ++vori.b $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:20: error: immediate must be an integer in the range [0, 255] ++ ++vxori.b $vr0, $vr1, -1 ++# CHECK: 
:[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++vxori.b $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++vnori.b $vr0, $vr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++vnori.b $vr0, $vr1, 256 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++## simm8 ++vstelm.b $vr0, $a0, -129, 1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-128, 127] ++ ++vstelm.b $vr0, $a0, 128, 1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [-128, 127] ++ + ## simm8_lsl1 +-vstelm.h $vr0, $a0, 255, 1 ++vstelm.h $vr0, $a0, -258, 1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be a multiple of 2 in the range [-256, 254] ++ ++vstelm.h $vr0, $a0, 256, 1 + # CHECK: :[[#@LINE-1]]:21: error: immediate must be a multiple of 2 in the range [-256, 254] + + ## simm8_lsl2 +-vstelm.w $vr0, $a0, 512, 1 ++vstelm.w $vr0, $a0, -516, 1 + # CHECK: :[[#@LINE-1]]:21: error: immediate must be a multiple of 4 in the range [-512, 508] + +-## simm10 +-vrepli.b $vr0, 512 +-# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++vstelm.w $vr0, $a0, 512, 1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be a multiple of 4 in the range [-512, 508] + + ## simm8_lsl3 ++vstelm.d $vr0, $a0, -1032, 1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be a multiple of 8 in the range [-1024, 1016] ++ + vstelm.d $vr0, $a0, 1024, 1 + # CHECK: :[[#@LINE-1]]:21: error: immediate must be a multiple of 8 in the range [-1024, 1016] + + ## simm9_lsl3 ++vldrepl.d $vr0, $a0, -2056 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 8 in the range [-2048, 2040] ++ + vldrepl.d $vr0, $a0, 2048 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 8 in the range [-2048, 2040] + + ## simm10_lsl2 ++vldrepl.w $vr0, $a0, -2052 ++# 
CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 4 in the range [-2048, 2044] ++ + vldrepl.w $vr0, $a0, 2048 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 4 in the range [-2048, 2044] + ++## simm10 ++vrepli.b $vr0, -513 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.b $vr0, 512 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.h $vr0, -513 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.h $vr0, 512 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.w $vr0, -513 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.w $vr0, 512 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.d $vr0, -513 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ ++vrepli.d $vr0, 512 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-512, 511] ++ + ## simm11_lsl1 ++vldrepl.h $vr0, $a0, -2050 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 2 in the range [-2048, 2046] ++ + vldrepl.h $vr0, $a0, 2048 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 2 in the range [-2048, 2046] + ++## simm12 ++vldrepl.b $vr0, $a0, -2049 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-2048, 2047] ++ ++vldrepl.b $vr0, $a0, 2048 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-2048, 2047] ++ ++vst $vr0, $a0, -2049 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-2048, 2047] ++ ++vst $vr0, $a0, 2048 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-2048, 2047] ++ ++vld $vr0, $a0, -2049 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an 
integer in the range [-2048, 2047] ++ ++vld $vr0, $a0, 2048 ++# CHECK: :[[#@LINE-1]]:16: error: immediate must be an integer in the range [-2048, 2047] ++ + ## simm13 ++vldi $vr0, -4097 ++# CHECK: :[[#@LINE-1]]:12: error: immediate must be an integer in the range [-4096, 4095] ++ + vldi $vr0, 4096 + # CHECK: :[[#@LINE-1]]:12: error: immediate must be an integer in the range [-4096, 4095] +-- +2.20.1 + + +From aca10c260dfde07f2248a70e3d37770ee75e8e7a Mon Sep 17 00:00:00 2001 +From: chenli +Date: Sat, 19 Aug 2023 17:16:09 +0800 +Subject: [PATCH 08/35] [LoongArch][MC] Add invalid immediate testcases for + LASX instructions + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D157574 + +(cherry picked from commit d163ae8c255f663707d4b0d5de03fcb18274b3eb) +--- + llvm/test/MC/LoongArch/lasx/invalid-imm.s | 1149 ++++++++++++++++++++- + 1 file changed, 1143 insertions(+), 6 deletions(-) + +diff --git a/llvm/test/MC/LoongArch/lasx/invalid-imm.s b/llvm/test/MC/LoongArch/lasx/invalid-imm.s +index 5c61a7a42009..6f64a6f87802 100644 +--- a/llvm/test/MC/LoongArch/lasx/invalid-imm.s ++++ b/llvm/test/MC/LoongArch/lasx/invalid-imm.s +@@ -3,53 +3,1190 @@ + # RUN: not llvm-mc --triple=loongarch64 %s 2>&1 | FileCheck %s + + ## uimm1 ++xvrepl128vei.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 1] ++ + xvrepl128vei.d $xr0, $xr1, 2 + # CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 1] + ++## uimm2 ++xvpickve.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++xvpickve.d $xr0, $xr1, 4 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++xvinsve0.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++xvinsve0.d $xr0, $xr1, 4 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 3] ++ ++xvinsgr2vr.d $xr0, $a0, -1 
++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 3] ++ ++xvinsgr2vr.d $xr0, $a0, 4 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 3] ++ ++xvpickve2gr.d $a0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 3] ++ ++xvpickve2gr.d $a0, $xr1, 4 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 3] ++ ++xvpickve2gr.du $a0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 3] ++ ++xvpickve2gr.du $a0, $xr1, 4 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 3] ++ ++xvstelm.d $xr0, $a0, 8, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 3] ++ ++xvstelm.d $xr0, $a0, 8, 4 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 3] ++ ++xvrepl128vei.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 3] ++ ++xvrepl128vei.w $xr0, $xr1, 4 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 3] ++ ++## uimm3 ++xvpickve.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++xvpickve.w $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++xvinsve0.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++xvinsve0.w $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 7] ++ ++xvinsgr2vr.w $xr0, $a0, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvinsgr2vr.w $xr0, $a0, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvpickve2gr.wu $a0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 7] ++ ++xvpickve2gr.wu $a0, 
$xr1, 8 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 7] ++ ++xvpickve2gr.w $a0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 7] ++ ++xvpickve2gr.w $a0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 7] ++ ++xvstelm.w $xr0, $a0, 4, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvstelm.w $xr0, $a0, 4, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvrepl128vei.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 7] ++ ++xvrepl128vei.h $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 7] ++ ++xvbitrevi.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvbitrevi.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvbitseti.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvbitseti.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvbitclri.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvbitclri.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 7] ++ ++xvsrari.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 7] ++ ++xvsrari.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 7] ++ ++xvsrlri.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 7] ++ ++xvsrlri.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 7] ++ ++xvsllwil.hu.bu $xr0, $xr1, -1 ++# 
CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 7] ++ ++xvsllwil.hu.bu $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 7] ++ ++xvsllwil.h.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 7] ++ ++xvsllwil.h.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 7] ++ ++xvrotri.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 7] ++ ++xvrotri.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 7] ++ ++xvsrai.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvsrai.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvsrli.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvsrli.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvslli.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvslli.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvsat.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++xvsat.b $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 7] ++ ++xvsat.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ ++xvsat.bu $xr0, $xr1, 8 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 7] ++ + ## uimm4 ++xvstelm.h $xr0, $a0, 2, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvstelm.h $xr0, $a0, 2, 16 ++# CHECK: :[[#@LINE-1]]:25: error: 
immediate must be an integer in the range [0, 15] ++ ++xvrepl128vei.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvrepl128vei.b $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvbitrevi.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvbitrevi.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvbitseti.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvbitseti.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvbitclri.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvbitclri.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvssrarni.bu.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvssrarni.bu.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlrni.bu.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlrni.bu.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvssrarni.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrarni.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlrni.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlrni.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrani.bu.h $xr0, $xr1, -1 
++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrani.bu.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlni.bu.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlni.bu.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 15] ++ ++xvssrani.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvssrani.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlni.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvssrlni.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvsrarni.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvsrarni.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvsrlrni.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvsrlrni.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvsrani.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvsrani.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvsrlni.b.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvsrlni.b.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 15] ++ ++xvsrari.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 15] ++ 
++xvsrari.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 15] ++ ++xvsrlri.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 15] ++ ++xvsrlri.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 15] ++ ++xvsllwil.wu.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvsllwil.wu.hu $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 15] ++ ++xvsllwil.w.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvsllwil.w.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 15] ++ ++xvrotri.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 15] ++ ++xvrotri.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 15] ++ ++xvsrai.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvsrai.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvsrli.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvsrli.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvslli.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvslli.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvsat.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] ++ + xvsat.h $xr0, $xr1, 16 + # CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 15] + ++xvsat.hu $xr0, $xr1, 
-1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++xvsat.hu $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 15] ++ ++## uimm5 ++xvstelm.b $xr0, $a0, 1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvstelm.b $xr0, $a0, 1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvbsrl.v $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvbsrl.v $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvbsll.v $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvbsll.v $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.du $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.hu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslti.bu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.du $xr0, $xr1, 32 ++# CHECK: 
:[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.hu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvslei.bu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvfrstpi.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++xvfrstpi.h $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++xvfrstpi.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++xvfrstpi.b $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] ++ ++xvbitrevi.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvbitrevi.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvbitseti.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvbitseti.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvbitclri.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvbitclri.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvssrarni.hu.w $xr0, $xr1, -1 ++# CHECK: 
:[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 31] ++ ++xvssrarni.hu.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlrni.hu.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlrni.hu.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 31] ++ ++xvssrarni.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrarni.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlrni.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlrni.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrani.hu.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrani.hu.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlni.hu.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlni.hu.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 31] ++ ++xvssrani.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvssrani.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlni.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvssrlni.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvsrarni.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ 
++xvsrarni.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvsrlrni.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvsrlrni.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvsrani.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvsrani.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvsrlni.h.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvsrlni.h.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 31] ++ ++xvsrari.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsrari.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsrlri.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsrlri.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsllwil.du.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 31] ++ ++xvsllwil.du.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 31] ++ ++xvsllwil.d.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvsllwil.d.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 31] ++ ++xvrotri.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvrotri.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 
31] ++ ++xvsrai.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvsrai.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvsrli.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvsrli.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvslli.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvslli.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.bu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.hu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvaddi.du $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.bu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.hu $xr0, $xr1, 32 
++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsubi.du $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.bu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.hu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmaxi.du $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.bu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.bu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.hu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.hu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.wu $xr0, $xr1, -1 ++# CHECK: 
:[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvmini.du $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 31] ++ ++xvsat.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++xvsat.w $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] ++ ++xvsat.wu $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ ++xvsat.wu $xr0, $xr1, 32 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 31] ++ + ## simm5 ++xvslti.d $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.d $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.w $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.w $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.h $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.b $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslti.b $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.d $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.d $xr0, $xr1, 16 ++# CHECK: 
:[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.w $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.w $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.h $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.b $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvslei.b $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.d $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.d $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.w $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.w $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.h $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvseqi.b $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ + xvseqi.b $xr0, $xr1, 16 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] + ++xvmaxi.b $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.b $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.h $xr0, $xr1, -17 ++# CHECK: 
:[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.w $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.w $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.d $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmaxi.d $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.b $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.b $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.h $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.h $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.w $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.w $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.d $xr0, $xr1, -17 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++xvmini.d $xr0, $xr1, 16 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-16, 15] ++ ++## uimm6 ++xvbitrevi.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvbitrevi.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvbitseti.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvbitseti.d $xr0, $xr1, 
64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvbitclri.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvbitclri.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvssrarni.wu.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 63] ++ ++xvssrarni.wu.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlrni.wu.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlrni.wu.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 63] ++ ++xvssrarni.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrarni.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlrni.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlrni.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrani.wu.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrani.wu.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlni.wu.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlni.wu.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 63] ++ ++xvssrani.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvssrani.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range 
[0, 63] ++ ++xvssrlni.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvssrlni.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvsrarni.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvsrarni.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvsrlrni.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvsrlrni.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 63] ++ ++xvsrani.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvsrani.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvsrlni.w.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvsrlni.w.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 63] ++ ++xvsrari.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 63] ++ ++xvsrari.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 63] ++ ++xvsrlri.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 63] ++ ++xvsrlri.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 63] ++ ++xvrotri.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 63] ++ ++xvrotri.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 63] ++ ++xvsrai.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range 
[0, 63] ++ ++xvsrai.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++xvsrli.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++xvsrli.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++xvslli.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++xvslli.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++xvsat.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++xvsat.d $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 63] ++ ++xvsat.du $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ ++xvsat.du $xr0, $xr1, 64 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 63] ++ + ## uimm7 ++xvssrarni.du.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 127] ++ ++xvssrarni.du.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlrni.du.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlrni.du.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:28: error: immediate must be an integer in the range [0, 127] ++ ++xvssrarni.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrarni.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlrni.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlrni.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an 
integer in the range [0, 127] ++ ++xvssrani.du.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrani.du.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlni.du.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlni.du.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:27: error: immediate must be an integer in the range [0, 127] ++ ++xvssrani.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvssrani.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlni.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvssrlni.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvsrarni.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvsrarni.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvsrlrni.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvsrlrni.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:26: error: immediate must be an integer in the range [0, 127] ++ ++xvsrani.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++xvsrani.d.q $xr0, $xr1, 128 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ ++xvsrlni.d.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] ++ + xvsrlni.d.q $xr0, $xr1, 128 + # CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 127] + +-## simm8 ++## uimm8 ++xvextrins.d 
$xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.d $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.w $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.h $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvextrins.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvpermi.q $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++xvpermi.q $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++xvpermi.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++xvpermi.d $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ ++xvpermi.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] ++ + xvpermi.w $xr0, $xr1, 256 + # CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [0, 255] + ++xvshuf4i.d $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvshuf4i.d $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvshuf4i.w $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] 
++ ++xvshuf4i.w $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvshuf4i.h $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvshuf4i.h $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvshuf4i.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvshuf4i.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:24: error: immediate must be an integer in the range [0, 255] ++ ++xvbitseli.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvbitseli.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:25: error: immediate must be an integer in the range [0, 255] ++ ++xvandi.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ ++xvandi.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ ++xvori.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++xvori.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:21: error: immediate must be an integer in the range [0, 255] ++ ++xvxori.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ ++xvxori.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ ++xvnori.b $xr0, $xr1, -1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ ++xvnori.b $xr0, $xr1, 256 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [0, 255] ++ ++## simm8 ++xvstelm.b $xr0, $a0, -129, 1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in the range [-128, 127] ++ ++xvstelm.b $xr0, $a0, 128, 1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be an integer in 
the range [-128, 127] ++ + ## simm8_lsl1 +-xvstelm.h $xr0, $a0, 255, 1 ++xvstelm.h $xr0, $a0, -258, 1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 2 in the range [-256, 254] ++ ++xvstelm.h $xr0, $a0, 256, 1 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 2 in the range [-256, 254] + + ## simm8_lsl2 +-xvstelm.w $xr0, $a0, 512, 1 ++xvstelm.w $xr0, $a0, -516, 1 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 4 in the range [-512, 508] + +-## simm10 +-xvrepli.b $xr0, 512 +-# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++xvstelm.w $xr0, $a0, 512, 1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 4 in the range [-512, 508] + + ## simm8_lsl3 ++xvstelm.d $xr0, $a0, -1032, 1 ++# CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 8 in the range [-1024, 1016] ++ + xvstelm.d $xr0, $a0, 1024, 1 + # CHECK: :[[#@LINE-1]]:22: error: immediate must be a multiple of 8 in the range [-1024, 1016] + + ## simm9_lsl3 ++xvldrepl.d $xr0, $a0, -2056 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be a multiple of 8 in the range [-2048, 2040] ++ + xvldrepl.d $xr0, $a0, 2048 + # CHECK: :[[#@LINE-1]]:23: error: immediate must be a multiple of 8 in the range [-2048, 2040] + + ## simm10_lsl2 ++xvldrepl.w $xr0, $a0, -2052 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be a multiple of 4 in the range [-2048, 2044] ++ + xvldrepl.w $xr0, $a0, 2048 + # CHECK: :[[#@LINE-1]]:23: error: immediate must be a multiple of 4 in the range [-2048, 2044] + ++## simm10 ++xvrepli.b $xr0, -513 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ ++xvrepli.b $xr0, 512 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ ++xvrepli.h $xr0, -513 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ ++xvrepli.h $xr0, 512 ++# CHECK: :[[#@LINE-1]]:17: error: 
immediate must be an integer in the range [-512, 511] ++ ++xvrepli.w $xr0, -513 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ ++xvrepli.w $xr0, 512 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ ++xvrepli.d $xr0, -513 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ ++xvrepli.d $xr0, 512 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-512, 511] ++ + ## simm11_lsl1 ++xvldrepl.h $xr0, $a0, -2050 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be a multiple of 2 in the range [-2048, 2046] ++ + xvldrepl.h $xr0, $a0, 2048 + # CHECK: :[[#@LINE-1]]:23: error: immediate must be a multiple of 2 in the range [-2048, 2046] + ++## simm12 ++xvldrepl.b $xr0, $a0, -2049 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [-2048, 2047] ++ ++xvldrepl.b $xr0, $a0, 2048 ++# CHECK: :[[#@LINE-1]]:23: error: immediate must be an integer in the range [-2048, 2047] ++ ++xvst $xr0, $a0, -2049 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-2048, 2047] ++ ++xvst $xr0, $a0, 2048 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-2048, 2047] ++ ++xvld $xr0, $a0, -2049 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-2048, 2047] ++ ++xvld $xr0, $a0, 2048 ++# CHECK: :[[#@LINE-1]]:17: error: immediate must be an integer in the range [-2048, 2047] ++ + ## simm13 ++xvldi $xr0, -4097 ++# CHECK: :[[#@LINE-1]]:13: error: immediate must be an integer in the range [-4096, 4095] ++ + xvldi $xr0, 4096 + # CHECK: :[[#@LINE-1]]:13: error: immediate must be an integer in the range [-4096, 4095] +-- +2.20.1 + + +From 73373a6158629eb02ed9fe0e540c21ffb84a549f Mon Sep 17 00:00:00 2001 +From: chenli +Date: Mon, 21 Aug 2023 11:03:49 +0800 +Subject: [PATCH 09/35] [LoongArch] Add testcases of LSX intrinsics with + immediates + 
+The testcases mainly cover three situations: +- the arguments which should be immediates are non immediates. +- the immediate is out of upper limit of the argument type. +- the immediate is out of lower limit of the argument type. + +Depends on D155829 + +Reviewed By: SixWeining + +Differential Revision: https://reviews.llvm.org/D157570 + +(cherry picked from commit 0c76f46ca676ebecbdf2c9f7e8b05421a234bbed) +--- + .../lsx/intrinsic-addi-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-addi-non-imm.ll | 37 +++++ + .../lsx/intrinsic-andi-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-andi-non-imm.ll | 10 ++ + .../lsx/intrinsic-bitclr-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-bitclr-non-imm.ll | 37 +++++ + .../lsx/intrinsic-bitrev-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-bitrev-non-imm.ll | 37 +++++ + .../lsx/intrinsic-bitseli-invalid-imm.ll | 17 +++ + .../lsx/intrinsic-bitseli-non-imm.ll | 10 ++ + .../lsx/intrinsic-bitset-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-bitset-non-imm.ll | 37 +++++ + .../lsx/intrinsic-bsll-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-bsll-non-imm.ll | 10 ++ + .../lsx/intrinsic-bsrl-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-bsrl-non-imm.ll | 10 ++ + .../lsx/intrinsic-extrins-invalid-imm.ll | 65 +++++++++ + .../lsx/intrinsic-extrins-non-imm.ll | 37 +++++ + .../lsx/intrinsic-frstp-invalid-imm.ll | 33 +++++ + .../LoongArch/lsx/intrinsic-frstp-non-imm.ll | 19 +++ + .../lsx/intrinsic-insgr2vr-invalid-imm.ll | 65 +++++++++ + .../lsx/intrinsic-insgr2vr-non-imm.ll | 37 +++++ + .../LoongArch/lsx/intrinsic-ld-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-ld-non-imm.ll | 10 ++ + .../lsx/intrinsic-ldi-invalid-imm.ll | 81 +++++++++++ + .../LoongArch/lsx/intrinsic-ldi-non-imm.ll | 46 +++++++ + .../lsx/intrinsic-ldrepl-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-ldrepl-non-imm.ll | 37 +++++ + .../lsx/intrinsic-max-invalid-imm.ll | 129 
++++++++++++++++++ + .../LoongArch/lsx/intrinsic-max-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-min-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lsx/intrinsic-min-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-nori-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-nori-non-imm.ll | 10 ++ + .../lsx/intrinsic-ori-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-ori-non-imm.ll | 10 ++ + .../lsx/intrinsic-permi-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-permi-non-imm.ll | 10 ++ + .../lsx/intrinsic-pickve2gr-invalid-imm.ll | 129 ++++++++++++++++++ + .../lsx/intrinsic-pickve2gr-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-replvei-invalid-imm.ll | 65 +++++++++ + .../lsx/intrinsic-replvei-non-imm.ll | 37 +++++ + .../lsx/intrinsic-rotr-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-rotr-non-imm.ll | 37 +++++ + .../lsx/intrinsic-sat-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lsx/intrinsic-sat-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-seq-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-seq-non-imm.ll | 37 +++++ + .../lsx/intrinsic-shuf4i-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-shuf4i-non-imm.ll | 37 +++++ + .../lsx/intrinsic-sle-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lsx/intrinsic-sle-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-sll-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-sll-non-imm.ll | 37 +++++ + .../lsx/intrinsic-sllwil-invalid-imm.ll | 97 +++++++++++++ + .../LoongArch/lsx/intrinsic-sllwil-non-imm.ll | 55 ++++++++ + .../lsx/intrinsic-slt-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lsx/intrinsic-slt-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-sra-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-sra-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srani-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-srani-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srar-invalid-imm.ll | 65 +++++++++ + 
.../LoongArch/lsx/intrinsic-srar-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srarni-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-srarni-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srl-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-srl-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srlni-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-srlni-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srlr-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-srlr-non-imm.ll | 37 +++++ + .../lsx/intrinsic-srlrni-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-srlrni-non-imm.ll | 37 +++++ + .../lsx/intrinsic-ssrani-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lsx/intrinsic-ssrani-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-ssrarni-invalid-imm.ll | 129 ++++++++++++++++++ + .../lsx/intrinsic-ssrarni-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-ssrlni-invalid-imm.ll | 129 ++++++++++++++++++ + .../LoongArch/lsx/intrinsic-ssrlni-non-imm.ll | 73 ++++++++++ + .../lsx/intrinsic-ssrlrni-invalid-imm.ll | 129 ++++++++++++++++++ + .../lsx/intrinsic-ssrlrni-non-imm.ll | 73 ++++++++++ + .../LoongArch/lsx/intrinsic-st-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-st-non-imm.ll | 10 ++ + .../lsx/intrinsic-stelm-invalid-imm.ll | 121 ++++++++++++++++ + .../LoongArch/lsx/intrinsic-stelm-non-imm.ll | 65 +++++++++ + .../lsx/intrinsic-subi-invalid-imm.ll | 65 +++++++++ + .../LoongArch/lsx/intrinsic-subi-non-imm.ll | 37 +++++ + .../lsx/intrinsic-xori-invalid-imm.ll | 17 +++ + .../LoongArch/lsx/intrinsic-xori-non-imm.ll | 10 ++ + 90 files changed, 4949 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-non-imm.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-non-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-invalid-imm.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-non-imm.ll + +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-invalid-imm.ll +new file mode 100644 +index 000000000000..6875872b6f83 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vaddi_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vaddi_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.bu: argument out of range ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vaddi.bu(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vaddi_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vaddi_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> %va, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vaddi_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vaddi_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vaddi_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vaddi_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vaddi.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-non-imm.ll +new file mode 100644 +index 000000000000..87d32b3ce02a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-addi-non-imm.ll +@@ -0,0 +1,37 
@@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vaddi_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vaddi.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vaddi_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vaddi.hu(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vaddi_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vaddi.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vaddi_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vaddi.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-invalid-imm.ll +new file mode 100644 +index 000000000000..82a117b2aba5 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vandi_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vandi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> 
@lsx_vandi_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vandi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> %va, i32 256) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-non-imm.ll +new file mode 100644 +index 000000000000..c0c35c775266 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-andi-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vandi_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vandi.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-invalid-imm.ll +new file mode 100644 +index 000000000000..b020806cd86c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitclri_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vbitclri_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitclri_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.h: 
argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vbitclri_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitclri_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vbitclri_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitclri_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vbitclri_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitclri.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-non-imm.ll +new file mode 100644 +index 000000000000..df6cdb99cdbc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitclr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitclri_b(<16 x i8> %va, i32 
%b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitclri.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitclri_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitclri.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitclri_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitclri.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitclri_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitclri.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-invalid-imm.ll +new file mode 100644 +index 000000000000..24b6ec3284cb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitrevi_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vbitrevi_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.b: argument out of range ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vbitrevi.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitrevi_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vbitrevi_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitrevi_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vbitrevi_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitrevi_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vbitrevi_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitrevi.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-non-imm.ll +new file mode 100644 +index 000000000000..3ffb494c9907 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitrev-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitrevi_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitrevi.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitrevi_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitrevi.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitrevi_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitrevi.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitrevi_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitrevi.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-invalid-imm.ll +new file mode 100644 +index 000000000000..bc63b40e9fca +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitseli_b_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseli.b: argument out 
of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vbitseli_b_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> %va, <16 x i8> %vb, i32 256) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-non-imm.ll +new file mode 100644 +index 000000000000..52c1eb7d2024 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitseli-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitseli_b(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseli.b(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-invalid-imm.ll +new file mode 100644 +index 000000000000..e57e14d8cb07 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitseti_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vbitseti_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.b: argument out of range ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vbitseti.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitseti_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vbitseti_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitseti_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vbitseti_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitseti_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vbitseti_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbitseti.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-non-imm.ll +new file mode 100644 +index 000000000000..9b2bde015ed9 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bitset-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbitseti_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbitseti.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vbitseti_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vbitseti.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vbitseti_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vbitseti.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vbitseti_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vbitseti.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-invalid-imm.ll +new file mode 100644 +index 000000000000..eb49af49c9be +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbsll_v_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbsll.v: argument out of range ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vbsll.v(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vbsll_v_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbsll.v: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-non-imm.ll +new file mode 100644 +index 000000000000..5b10c9e91a4f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsll-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbsll_v(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbsll.v(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-invalid-imm.ll +new file mode 100644 +index 000000000000..bf56822e2ef5 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbsrl_v_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbsrl.v: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vbsrl_v_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vbsrl.v: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-non-imm.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-non-imm.ll +new file mode 100644 +index 000000000000..0bc038c869ce +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-bsrl-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vbsrl_v(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vbsrl.v(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-invalid-imm.ll +new file mode 100644 +index 000000000000..7f94234ed603 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vextrins_b_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vextrins_b_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> %va, <16 x i8> %vb, i32 256) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vextrins_h_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> 
@lsx_vextrins_h_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> %va, <8 x i16> %vb, i32 256) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vextrins_w_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vextrins_w_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> %va, <4 x i32> %vb, i32 256) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vextrins_d_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vextrins_d_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vextrins.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> %va, <2 x i64> %vb, i32 256) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-non-imm.ll +new file mode 100644 +index 000000000000..e834002bb60b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-extrins-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> 
@lsx_vextrins_b(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vextrins.b(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vextrins_h(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vextrins.h(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vextrins_w(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vextrins.w(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vextrins_d(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vextrins.d(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-invalid-imm.ll +new file mode 100644 +index 000000000000..0184c855c9c1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-invalid-imm.ll +@@ -0,0 +1,33 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vfrstpi_b_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vfrstpi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> %va, <16 x i8> %vb, 
i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vfrstpi_b_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vfrstpi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> %va, <16 x i8> %vb, i32 32) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vfrstpi_h_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vfrstpi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vfrstpi_h_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vfrstpi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-non-imm.ll +new file mode 100644 +index 000000000000..9583f672a305 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frstp-non-imm.ll +@@ -0,0 +1,19 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vfrstpi_b(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vfrstpi.b(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vfrstpi_h(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vfrstpi.h(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} +diff 
--git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-invalid-imm.ll +new file mode 100644 +index 000000000000..3d4f84fb6e03 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8>, i32, i32) ++ ++define <16 x i8> @lsx_vinsgr2vr_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> %va, i32 1, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vinsgr2vr_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> %va, i32 1, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16>, i32, i32) ++ ++define <8 x i16> @lsx_vinsgr2vr_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> %va, i32 1, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vinsgr2vr_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> %va, i32 1, i32 8) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32>, i32, i32) ++ ++define <4 x i32> @lsx_vinsgr2vr_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> %va, i32 1, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vinsgr2vr_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.w: 
argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> %va, i32 1, i32 4) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32) ++ ++define <2 x i64> @lsx_vinsgr2vr_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %va, i64 1, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vinsgr2vr_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vinsgr2vr.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %va, i64 1, i32 2) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-non-imm.ll +new file mode 100644 +index 000000000000..2a4c2218de8c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-insgr2vr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8>, i32, i32) ++ ++define <16 x i8> @lsx_vinsgr2vr_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8> %va, i32 1, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16>, i32, i32) ++ ++define <8 x i16> @lsx_vinsgr2vr_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16> %va, i32 1, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32>, i32, i32) ++ ++define <4 x i32> @lsx_vinsgr2vr_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32> %va, i32 1, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32) ++ ++define <2 x i64> @lsx_vinsgr2vr_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %va, i64 1, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-invalid-imm.ll +new file mode 100644 +index 000000000000..3aeb30ce66b4 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vld(i8*, i32) ++ ++define <16 x i8> @lsx_vld_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vld: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vld(i8* %p, i32 -2049) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vld_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vld: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vld(i8* %p, i32 2048) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-non-imm.ll +new file mode 100644 +index 000000000000..db6a0318d87a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ld-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vld(i8*, i32) ++ ++define <16 x i8> @lsx_vld(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vld(i8* %p, i32 %a) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-invalid-imm.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-invalid-imm.ll +new file mode 100644 +index 000000000000..57f6f8e81d91 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-invalid-imm.ll +@@ -0,0 +1,81 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <2 x i64> @llvm.loongarch.lsx.vldi(i32) ++ ++define <2 x i64> @lsx_vldi_lo() nounwind { ++; CHECK: llvm.loongarch.lsx.vldi: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldi(i32 -4097) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vldi_hi() nounwind { ++; CHECK: llvm.loongarch.lsx.vldi: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldi(i32 4096) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32) ++ ++define <16 x i8> @lsx_vrepli_b_lo() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 -513) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vrepli_b_hi() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 512) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32) ++ ++define <8 x i16> @lsx_vrepli_h_lo() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 -513) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vrepli_h_hi() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 512) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32) ++ ++define <4 x i32> @lsx_vrepli_w_lo() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 -513) ++ ret <4 x i32> %res ++} ++ ++define 
<4 x i32> @lsx_vrepli_w_hi() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 512) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32) ++ ++define <2 x i64> @lsx_vrepli_d_lo() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 -513) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vrepli_d_hi() nounwind { ++; CHECK: llvm.loongarch.lsx.vrepli.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 512) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-non-imm.ll +new file mode 100644 +index 000000000000..a8f8278f8097 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldi-non-imm.ll +@@ -0,0 +1,46 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <2 x i64> @llvm.loongarch.lsx.vldi(i32) ++ ++define <2 x i64> @lsx_vldi(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldi(i32 %a) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32) ++ ++define <16 x i8> @lsx_vrepli_b(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrepli.b(i32 %a) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32) ++ ++define <8 x i16> @lsx_vrepli_h(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrepli.h(i32 %a) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32) ++ ++define <4 x i32> @lsx_vrepli_w(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ 
%res = call <4 x i32> @llvm.loongarch.lsx.vrepli.w(i32 %a) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32) ++ ++define <2 x i64> @lsx_vrepli_d(i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrepli.d(i32 %a) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-invalid-imm.ll +new file mode 100644 +index 000000000000..cb640e1245da +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8*, i32) ++ ++define <16 x i8> @lsx_vldrepl_b_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8* %p, i32 -2049) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vldrepl_b_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8* %p, i32 2048) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8*, i32) ++ ++define <8 x i16> @lsx_vldrepl_h_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.h: argument out of range or not a multiple of 2. ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8* %p, i32 -2050) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vldrepl_h_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.h: argument out of range or not a multiple of 2. 
++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8* %p, i32 2048) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8*, i32) ++ ++define <4 x i32> @lsx_vldrepl_w_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.w: argument out of range or not a multiple of 4. ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8* %p, i32 -2052) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vldrepl_w_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.w: argument out of range or not a multiple of 4. ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8* %p, i32 2048) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8*, i32) ++ ++define <2 x i64> @lsx_vldrepl_d_lo(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.d: argument out of range or not a multiple of 8. ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8* %p, i32 -2056) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vldrepl_d_hi(i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vldrepl.d: argument out of range or not a multiple of 8. 
++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8* %p, i32 2048) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-non-imm.ll +new file mode 100644 +index 000000000000..e60b21913c69 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ldrepl-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8*, i32) ++ ++define <16 x i8> @lsx_vldrepl_b(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vldrepl.b(i8* %p, i32 %a) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8*, i32) ++ ++define <8 x i16> @lsx_vldrepl_h(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vldrepl.h(i8* %p, i32 %a) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8*, i32) ++ ++define <4 x i32> @lsx_vldrepl_w(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vldrepl.w(i8* %p, i32 %a) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8*, i32) ++ ++define <2 x i64> @lsx_vldrepl_d(i8* %p, i32 %a) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vldrepl.d(i8* %p, i32 %a) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll +new file mode 100644 +index 000000000000..667ba32723fc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s 
++ ++declare <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmaxi_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> %va, i32 -17) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vmaxi_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> %va, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmaxi_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> %va, i32 -17) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vmaxi_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmaxi_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> %va, i32 -17) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vmaxi_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmaxi_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> %va, i32 -17) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vmaxi_d_hi(<2 x i64> %va) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vmaxi.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> %va, i32 16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmaxi_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vmaxi_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmaxi_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vmaxi_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> %va, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmaxi_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vmaxi_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmaxi_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.du: argument out of range 
++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vmaxi_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmaxi.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-non-imm.ll +new file mode 100644 +index 000000000000..34bbe3495670 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmaxi_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmaxi_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmaxi_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmaxi_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8>, i32) ++ ++define <16 
x i8> @lsx_vmaxi_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmaxi.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmaxi_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmaxi.hu(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmaxi_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmaxi.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmaxi_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmaxi.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll +new file mode 100644 +index 000000000000..b73bada4f06f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmini_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> %va, i32 -17) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vmini_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 
x i8> %va, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmini_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> %va, i32 -17) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vmini_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmini_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> %va, i32 -17) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vmini_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmini_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> %va, i32 -17) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vmini_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> %va, i32 16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmini_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> 
@lsx_vmini_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmini_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vmini_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> %va, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmini_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vmini_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmini_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vmini_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vmini.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-non-imm.ll +new file mode 
100644 +index 000000000000..5d9b98cec4d0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmini_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmini_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmini_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmini_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vmini_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vmini.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vmini_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vmini.hu(<8 x i16> %va, i32 
%b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vmini_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vmini.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vmini_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vmini.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-invalid-imm.ll +new file mode 100644 +index 000000000000..8c59d8fb9fa5 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vnori_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vnori.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vnori_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vnori.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> %va, i32 256) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-non-imm.ll +new file mode 100644 +index 000000000000..322a39c106a6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-nori-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8>, i32) ++ ++define <16 
x i8> @lsx_vnori_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vnori.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-invalid-imm.ll +new file mode 100644 +index 000000000000..4a7fc7e109d9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vori_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vori.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vori_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vori.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> %va, i32 256) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-non-imm.ll +new file mode 100644 +index 000000000000..5644b8581dce +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ori-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vori_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vori.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-invalid-imm.ll +new file mode 100644 +index 000000000000..e439bbae6130 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vpermi_w_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vpermi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vpermi_w_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vpermi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> %va, <4 x i32> %vb, i32 256) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-non-imm.ll +new file mode 100644 +index 000000000000..bdfc08ed680a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-permi-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vpermi_w(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vpermi.w(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll +new file mode 100644 +index 000000000000..3430c54d2194 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) ++ ++define i32 @lsx_vpickve2gr_b_lo(<16 x i8> %va) 
nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.b: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lsx_vpickve2gr_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.b: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> %va, i32 16) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16>, i32) ++ ++define i32 @lsx_vpickve2gr_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.h: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lsx_vpickve2gr_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.h: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> %va, i32 8) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32>, i32) ++ ++define i32 @lsx_vpickve2gr_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.w: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lsx_vpickve2gr_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.w: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> %va, i32 4) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) ++ ++define i64 @lsx_vpickve2gr_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 -1) ++ ret i64 %res ++} ++ ++define i64 @lsx_vpickve2gr_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 2) ++ ret i64 
%res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) ++ ++define i32 @lsx_vpickve2gr_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.bu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lsx_vpickve2gr_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.bu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> %va, i32 16) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16>, i32) ++ ++define i32 @lsx_vpickve2gr_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.hu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lsx_vpickve2gr_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.hu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> %va, i32 8) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32>, i32) ++ ++define i32 @lsx_vpickve2gr_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.wu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 -1) ++ ret i32 %res ++} ++ ++define i32 @lsx_vpickve2gr_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.wu: argument out of range ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 4) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) ++ ++define i64 @lsx_vpickve2gr_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 -1) ++ ret i64 %res ++} ++ ++define i64 @lsx_vpickve2gr_du_hi(<2 x i64> %va) nounwind { ++; 
CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 2) ++ ret i64 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-non-imm.ll +new file mode 100644 +index 000000000000..6dd3c1f27a81 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) ++ ++define i32 @lsx_vpickve2gr_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16>, i32) ++ ++define i32 @lsx_vpickve2gr_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.h(<8 x i16> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32>, i32) ++ ++define i32 @lsx_vpickve2gr_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.w(<4 x i32> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) ++ ++define i64 @lsx_vpickve2gr_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 %b) ++ ret i64 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) ++ ++define i32 @lsx_vpickve2gr_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8> %va, i32 %b) ++ ret i32 
%res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16>, i32) ++ ++define i32 @lsx_vpickve2gr_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.hu(<8 x i16> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32>, i32) ++ ++define i32 @lsx_vpickve2gr_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 %b) ++ ret i32 %res ++} ++ ++declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) ++ ++define i64 @lsx_vpickve2gr_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 %b) ++ ret i64 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-invalid-imm.ll +new file mode 100644 +index 000000000000..d625441122a6 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vreplvei_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vreplvei_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> %va, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vreplvei_h_lo(<8 x i16> %va) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vreplvei.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vreplvei_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> %va, i32 8) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vreplvei_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vreplvei_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> %va, i32 4) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vreplvei_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vreplvei_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vreplvei.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> %va, i32 2) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-non-imm.ll +new file mode 100644 +index 000000000000..3d271bb2b307 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replvei-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8>, i32) ++ ++define <16 x i8> 
@lsx_vreplvei_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vreplvei.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vreplvei_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vreplvei.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vreplvei_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vreplvei.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vreplvei_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vreplvei.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-invalid-imm.ll +new file mode 100644 +index 000000000000..3c53b36672ad +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vrotri_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vrotri_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.b: argument out of range ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vrotri.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vrotri_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vrotri_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vrotri_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vrotri_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vrotri_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vrotri_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vrotri.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-non-imm.ll +new file mode 100644 +index 000000000000..fd8ba3a1c633 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-rotr-non-imm.ll +@@ -0,0 +1,37 
@@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vrotri_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vrotri.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vrotri_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vrotri.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vrotri_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vrotri.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vrotri_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vrotri.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-invalid-imm.ll +new file mode 100644 +index 000000000000..45fa4e43be19 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsat_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> 
@lsx_vsat_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsat_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsat_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsat_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsat_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsat_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsat_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsat_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.bu: argument out of range 
++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsat_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsat_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsat_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsat_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsat_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsat_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsat_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsat.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git 
a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-non-imm.ll +new file mode 100644 +index 000000000000..afdbe0c1ce0b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sat-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsat_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsat_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsat_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsat_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsat_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsat.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsat_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg 
operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsat.hu(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsat_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsat.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsat_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsat.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-invalid-imm.ll +new file mode 100644 +index 000000000000..220398ff28cd +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vseqi_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> %va, i32 -17) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vseqi_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> %va, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vseqi_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> %va, i32 -17) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> 
@lsx_vseqi_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vseqi_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> %va, i32 -17) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vseqi_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vseqi_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> %va, i32 -17) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vseqi_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vseqi.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> %va, i32 16) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-non-imm.ll +new file mode 100644 +index 000000000000..5fa1dd30475c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-seq-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vseqi_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vseqi.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> 
@llvm.loongarch.lsx.vseqi.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vseqi_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vseqi.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vseqi_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vseqi.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vseqi_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vseqi.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-invalid-imm.ll +new file mode 100644 +index 000000000000..4d6fadf08c26 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vshuf4i_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vshuf4i_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> %va, i32 256) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vshuf4i_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.h: argument 
out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vshuf4i_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> %va, i32 256) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vshuf4i_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vshuf4i_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> %va, i32 256) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vshuf4i_d_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vshuf4i_d_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vshuf4i.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> %va, <2 x i64> %vb, i32 256) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-non-imm.ll +new file mode 100644 +index 000000000000..a7d138bcc00b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-shuf4i-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8>, i32) ++ ++define 
<16 x i8> @lsx_vshuf4i_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vshuf4i.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vshuf4i_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vshuf4i.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vshuf4i_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vshuf4i.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vshuf4i_d(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vshuf4i.d(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-invalid-imm.ll +new file mode 100644 +index 000000000000..4c945e296711 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslei_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> %va, i32 -17) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vslei_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.b: argument out of range ++entry: 
++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> %va, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslei_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> %va, i32 -17) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vslei_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslei_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> %va, i32 -17) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vslei_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslei_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> %va, i32 -17) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vslei_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> %va, i32 16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslei_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> %va, i32 
-1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vslei_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslei_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vslei_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> %va, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslei_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vslei_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslei_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vslei_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslei.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-non-imm.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-non-imm.ll +new file mode 100644 +index 000000000000..0fc137bf0549 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sle-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslei_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslei_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslei_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslei_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslei_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslei.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslei_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: 
++ %res = call <8 x i16> @llvm.loongarch.lsx.vslei.hu(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslei_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslei.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslei_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslei.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-invalid-imm.ll +new file mode 100644 +index 000000000000..75406f94887c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslli_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vslli_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslli_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vslli_h_hi(<8 x i16> %va) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vslli.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslli_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vslli_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslli_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vslli_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslli.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-non-imm.ll +new file mode 100644 +index 000000000000..7474b5e29734 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sll-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslli_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslli.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16>, i32) ++ ++define <8 x i16> 
@lsx_vslli_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslli.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslli_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslli.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslli_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslli.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-invalid-imm.ll +new file mode 100644 +index 000000000000..bda3523a0b5c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-invalid-imm.ll +@@ -0,0 +1,97 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8>, i32) ++ ++define <8 x i16> @lsx_vsllwil_h_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.h.b: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsllwil_h_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.h.b: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> %va, i32 8) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16>, i32) ++ ++define <4 x i32> @lsx_vsllwil_w_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.w.h: argument out of range ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsllwil_w_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.w.h: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32>, i32) ++ ++define <2 x i64> @lsx_vsllwil_d_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.d.w: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsllwil_d_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.d.w: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> %va, i32 32) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8>, i32) ++ ++define <8 x i16> @lsx_vsllwil_hu_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.hu.bu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsllwil_hu_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.hu.bu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> %va, i32 8) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16>, i32) ++ ++define <4 x i32> @lsx_vsllwil_wu_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.wu.hu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsllwil_wu_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.wu.hu: argument out of range ++entry: ++ %res = call <4 x i32> 
@llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32>, i32) ++ ++define <2 x i64> @lsx_vsllwil_du_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.du.wu: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsllwil_du_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsllwil.du.wu: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-non-imm.ll +new file mode 100644 +index 000000000000..a03656d5ca07 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sllwil-non-imm.ll +@@ -0,0 +1,55 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8>, i32) ++ ++define <8 x i16> @lsx_vsllwil_h_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.h.b(<16 x i8> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16>, i32) ++ ++define <4 x i32> @lsx_vsllwil_w_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsllwil.w.h(<8 x i16> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32>, i32) ++ ++define <2 x i64> @lsx_vsllwil_d_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.d.w(<4 x i32> %va, i32 %b) ++ ret <2 x i64> %res ++} ++ ++declare <8 x i16> 
@llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8>, i32) ++ ++define <8 x i16> @lsx_vsllwil_hu_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsllwil.hu.bu(<16 x i8> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16>, i32) ++ ++define <4 x i32> @lsx_vsllwil_wu_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsllwil.wu.hu(<8 x i16> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32>, i32) ++ ++define <2 x i64> @lsx_vsllwil_du_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsllwil.du.wu(<4 x i32> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-invalid-imm.ll +new file mode 100644 +index 000000000000..f6d014b19d6c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslti_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> %va, i32 -17) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vslti_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> %va, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslti_h_lo(<8 x i16> %va) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vslti.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> %va, i32 -17) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vslti_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslti_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> %va, i32 -17) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vslti_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> %va, i32 16) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslti_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> %va, i32 -17) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vslti_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> %va, i32 16) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslti_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vslti_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> %va, i32 32) 
++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslti_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vslti_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> %va, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslti_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vslti_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslti_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vslti_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vslti.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-non-imm.ll +new file mode 100644 +index 000000000000..9a8b757dab4e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-slt-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx 
< %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslti_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslti_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslti_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslti_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vslti_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vslti.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vslti_hu(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vslti.hu(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vslti_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg 
operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vslti.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vslti_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vslti.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-invalid-imm.ll +new file mode 100644 +index 000000000000..2a033a21b565 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrai_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrai_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrai_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrai_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32>, i32) ++ ++define <4 x i32> 
@lsx_vsrai_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrai_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrai_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrai_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrai.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-non-imm.ll +new file mode 100644 +index 000000000000..c3b328145864 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-sra-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrai_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrai.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrai_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrai.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> 
@llvm.loongarch.lsx.vsrai.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrai_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrai.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrai_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrai.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-invalid-imm.ll +new file mode 100644 +index 000000000000..d68064e9b902 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrani_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrani_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrani_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrani_h_w_hi(<8 x i16> %va, 
<8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrani_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrani_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrani_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrani_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrani.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-non-imm.ll +new file mode 100644 +index 000000000000..38cfde214dc1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srani-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrani_b_h(<16 x i8> %va, <16 x i8> %vb, 
i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrani_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrani_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrani_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-invalid-imm.ll +new file mode 100644 +index 000000000000..b6c2d70cebbc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrari_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrari_b_hi(<16 x i8> %va) nounwind 
{ ++; CHECK: llvm.loongarch.lsx.vsrari.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrari_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrari_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrari_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrari_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrari_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrari_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrari.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-non-imm.ll +new file mode 100644 +index 
000000000000..2ad8adcd823b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srar-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrari_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrari.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrari_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrari.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrari_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrari.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrari_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrari.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-invalid-imm.ll +new file mode 100644 +index 000000000000..d24cf92a0392 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrarni_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vsrarni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrarni_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrarni_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrarni_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrarni_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrarni_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrarni_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.d.q: argument out of range ++entry: ++ %res = call <2 x 
i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrarni_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrarni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-non-imm.ll +new file mode 100644 +index 000000000000..19de7445cba1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srarni-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrarni_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrarni_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrarni_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrarni_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg 
operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-invalid-imm.ll +new file mode 100644 +index 000000000000..3beff790afab +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrli_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrli_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrli_h_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrli_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrli_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrli_w_hi(<4 x i32> %va) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vsrli.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrli_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrli_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrli.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-non-imm.ll +new file mode 100644 +index 000000000000..98652aca0d62 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srl-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrli_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrli.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrli_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrli.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrli_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrli.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> 
@llvm.loongarch.lsx.vsrli.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrli_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrli.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-invalid-imm.ll +new file mode 100644 +index 000000000000..054c4f393548 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlni_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrlni_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlni_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrlni_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32>, <4 x i32>, i32) 
++ ++define <4 x i32> @lsx_vsrlni_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrlni_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlni_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrlni_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-non-imm.ll +new file mode 100644 +index 000000000000..76341df197fd +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlni-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlni_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x 
i16> @lsx_vsrlni_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlni_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlni_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-invalid-imm.ll +new file mode 100644 +index 000000000000..bcbd38e26e5f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlri_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrlri_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> %va, i32 8) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlri_h_lo(<8 x i16> %va) 
nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrlri_h_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.h: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> %va, i32 16) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlri_w_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrlri_w_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.w: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlri_d_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrlri_d_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlri.d: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> %va, i32 64) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-non-imm.ll +new file mode 100644 +index 000000000000..4862b1546ccf +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlr-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlri_b(<16 x i8> %va, 
i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlri.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlri_h(<8 x i16> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlri.h(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlri_w(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlri.w(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlri_d(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlri.d(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-invalid-imm.ll +new file mode 100644 +index 000000000000..8988ae88f9eb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlrni_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsrlrni_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.b.h: argument out of range ++entry: ++ 
%res = call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlrni_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsrlrni_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlrni_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsrlrni_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlrni_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsrlrni_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vsrlrni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, 
i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-non-imm.ll +new file mode 100644 +index 000000000000..e5530db56fed +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-srlrni-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsrlrni_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsrlrni_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsrlrni_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsrlrni_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-invalid-imm.ll +new file mode 100644 +index 
000000000000..f7817921ebeb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrani_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrani_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrani_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrani_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrani_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrani_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.w.d: argument out of range 
++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrani_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrani_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrani_bu_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrani_bu_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrani_hu_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrani_hu_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x 
i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrani_wu_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrani_wu_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrani_du_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrani_du_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrani.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-non-imm.ll +new file mode 100644 +index 000000000000..a80ede9c5243 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrani-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrani.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrani_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> 
@llvm.loongarch.lsx.vssrani.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrani_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrani_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrani_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrani_bu_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrani.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrani_hu_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrani.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrani_wu_d(<4 x 
i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrani.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrani_du_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrani.du.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-invalid-imm.ll +new file mode 100644 +index 000000000000..4edda8c0a24a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrarni_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrarni_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrarni_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x 
i16> @lsx_vssrarni_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrarni_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrarni_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrarni_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrarni_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrarni_bu_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrarni_bu_h_hi(<16 x i8> %va, <16 x i8> %vb) 
nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrarni_hu_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrarni_hu_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrarni_wu_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrarni_wu_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrarni_du_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrarni.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrarni_du_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vssrarni.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-non-imm.ll +new file mode 100644 +index 000000000000..a77e6e764c9d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrarni-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrarni_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrarni_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrarni_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrarni_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} ++ 
++declare <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrarni_bu_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrarni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrarni_hu_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrarni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrarni_wu_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrarni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrarni_du_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrarni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-invalid-imm.ll +new file mode 100644 +index 000000000000..6218af1fa773 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlni_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vssrlni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrlni_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlni_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrlni_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlni_w_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrlni_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlni_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.d.q: argument out of range ++entry: ++ %res = call <2 x 
i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrlni_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlni_bu_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrlni_bu_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlni_hu_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrlni_hu_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlni_wu_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> %va, <4 x i32> %vb, 
i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrlni_wu_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlni_du_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrlni_du_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlni.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-non-imm.ll +new file mode 100644 +index 000000000000..688be826f467 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlni-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlni_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlni_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.h.w(<8 x i16> %va, <8 x i16> 
%vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlni_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlni_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlni_bu_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlni_hu_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlni_wu_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlni_du_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; 
CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-invalid-imm.ll +new file mode 100644 +index 000000000000..98a0c5b3cd28 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-invalid-imm.ll +@@ -0,0 +1,129 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlrni_b_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrlrni_b_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.b.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlrni_h_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrlrni_h_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.h.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlrni_w_d_lo(<4 x i32> %va, <4 
x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrlrni_w_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.w.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlrni_d_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrlrni_d_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.d.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlrni_bu_h_lo(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vssrlrni_bu_h_hi(<16 x i8> %va, <16 x i8> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.bu.h: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 16) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlrni_hu_w_lo(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: 
llvm.loongarch.lsx.vssrlrni.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vssrlrni_hu_w_hi(<8 x i16> %va, <8 x i16> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.hu.w: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlrni_wu_d_lo(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vssrlrni_wu_d_hi(<4 x i32> %va, <4 x i32> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.wu.d: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 64) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlrni_du_q_lo(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vssrlrni_du_q_hi(<2 x i64> %va, <2 x i64> %vb) nounwind { ++; CHECK: llvm.loongarch.lsx.vssrlrni.du.q: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 128) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-non-imm.ll +new file mode 100644 +index 000000000000..c389b4fd6023 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-ssrlrni-non-imm.ll +@@ -0,0 +1,73 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlrni_b_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.b.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlrni_h_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.h.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlrni_w_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.w.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlrni_d_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.d.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} ++ ++declare <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8>, <16 x i8>, i32) ++ ++define <16 x i8> @lsx_vssrlrni_bu_h(<16 x i8> %va, <16 x i8> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vssrlrni.bu.h(<16 x i8> %va, <16 x i8> %vb, i32 %c) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> 
@llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16>, <8 x i16>, i32) ++ ++define <8 x i16> @lsx_vssrlrni_hu_w(<8 x i16> %va, <8 x i16> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vssrlrni.hu.w(<8 x i16> %va, <8 x i16> %vb, i32 %c) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32>, <4 x i32>, i32) ++ ++define <4 x i32> @lsx_vssrlrni_wu_d(<4 x i32> %va, <4 x i32> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vssrlrni.wu.d(<4 x i32> %va, <4 x i32> %vb, i32 %c) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64>, <2 x i64>, i32) ++ ++define <2 x i64> @lsx_vssrlrni_du_q(<2 x i64> %va, <2 x i64> %vb, i32 %c) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vssrlrni.du.q(<2 x i64> %va, <2 x i64> %vb, i32 %c) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-invalid-imm.ll +new file mode 100644 +index 000000000000..64518380964b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lsx.vst(<16 x i8>, i8*, i32) ++ ++define void @lsx_vst_lo(<16 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vst: argument out of range ++entry: ++ call void @llvm.loongarch.lsx.vst(<16 x i8> %va, i8* %p, i32 -2049) ++ ret void ++} ++ ++define void @lsx_vst_hi(<16 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vst: argument out of range ++entry: ++ call void @llvm.loongarch.lsx.vst(<16 x i8> %va, i8* %p, i32 2048) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-non-imm.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-non-imm.ll +new file mode 100644 +index 000000000000..119ed9b78658 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-st-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lsx.vst(<16 x i8>, i8*, i32) ++ ++define void @lsx_vst(<16 x i8> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vst(<16 x i8> %va, i8* %p, i32 %b) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-invalid-imm.ll +new file mode 100644 +index 000000000000..277abcbd34cc +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-invalid-imm.ll +@@ -0,0 +1,121 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lsx.vstelm.b(<16 x i8>, i8*, i32, i32) ++ ++define void @lsx_vstelm_b_lo(<16 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 -129, i32 15) ++ ret void ++} ++ ++define void @lsx_vstelm_b_hi(<16 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 128, i32 15) ++ ret void ++} ++ ++define void @lsx_vstelm_b_idx_lo(<16 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 1, i32 -1) ++ ret void ++} ++ ++define void @lsx_vstelm_b_idx_hi(<16 x i8> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.b: argument out of range ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 1, i32 16) ++ ret void ++} ++ ++declare void 
@llvm.loongarch.lsx.vstelm.h(<8 x i16>, i8*, i32, i32) ++ ++define void @lsx_vstelm_h_lo(<8 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 -258, i32 7) ++ ret void ++} ++ ++define void @lsx_vstelm_h_hi(<8 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 256, i32 7) ++ ret void ++} ++ ++define void @lsx_vstelm_h_idx_lo(<8 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 2, i32 -1) ++ ret void ++} ++ ++define void @lsx_vstelm_h_idx_hi(<8 x i16> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.h: argument out of range or not a multiple of 2. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 2, i32 8) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.w(<4 x i32>, i8*, i32, i32) ++ ++define void @lsx_vstelm_w_lo(<4 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 -516, i32 3) ++ ret void ++} ++ ++define void @lsx_vstelm_w_hi(<4 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 512, i32 3) ++ ret void ++} ++ ++define void @lsx_vstelm_w_idx_lo(<4 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.w: argument out of range or not a multiple of 4. 
++entry: ++ call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 4, i32 -1) ++ ret void ++} ++ ++define void @lsx_vstelm_w_idx_hi(<4 x i32> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.w: argument out of range or not a multiple of 4. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 4, i32 4) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.d(<2 x i64>, i8*, i32, i32) ++ ++define void @lsx_vstelm_d_lo(<2 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.d: argument out of range or not a multiple of 8. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 -1032, i32 1) ++ ret void ++} ++ ++define void @lsx_vstelm_d_hi(<2 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.d: argument out of range or not a multiple of 8. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 1024, i32 1) ++ ret void ++} ++ ++define void @lsx_vstelm_d_idx_lo(<2 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.d: argument out of range or not a multiple of 8. ++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 8, i32 -1) ++ ret void ++} ++ ++define void @lsx_vstelm_d_idx_hi(<2 x i64> %va, i8* %p) nounwind { ++; CHECK: llvm.loongarch.lsx.vstelm.d: argument out of range or not a multiple of 8. 
++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 8, i32 2) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-non-imm.ll +new file mode 100644 +index 000000000000..f53932f79035 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-stelm-non-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare void @llvm.loongarch.lsx.vstelm.b(<16 x i8>, i8*, i32, i32) ++ ++define void @lsx_vstelm_b(<16 x i8> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lsx_vstelm_b_idx(<16 x i8> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.b(<16 x i8> %va, i8* %p, i32 1, i32 %b) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.h(<8 x i16>, i8*, i32, i32) ++ ++define void @lsx_vstelm_h(<8 x i16> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lsx_vstelm_h_idx(<8 x i16> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.h(<8 x i16> %va, i8* %p, i32 2, i32 %b) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.w(<4 x i32>, i8*, i32, i32) ++ ++define void @lsx_vstelm_w(<4 x i32> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lsx_vstelm_w_idx(<4 x i32> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ 
call void @llvm.loongarch.lsx.vstelm.w(<4 x i32> %va, i8* %p, i32 4, i32 %b) ++ ret void ++} ++ ++declare void @llvm.loongarch.lsx.vstelm.d(<2 x i64>, i8*, i32, i32) ++ ++define void @lsx_vstelm_d(<2 x i64> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 %b, i32 1) ++ ret void ++} ++ ++define void @lsx_vstelm_d_idx(<2 x i64> %va, i8* %p, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ call void @llvm.loongarch.lsx.vstelm.d(<2 x i64> %va, i8* %p, i32 8, i32 %b) ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-invalid-imm.ll +new file mode 100644 +index 000000000000..96cc1241fbf3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-invalid-imm.ll +@@ -0,0 +1,65 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsubi_bu_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vsubi_bu_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.bu: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> %va, i32 32) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsubi_hu_lo(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.hu: argument out of range ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> %va, i32 -1) ++ ret <8 x i16> %res ++} ++ ++define <8 x i16> @lsx_vsubi_hu_hi(<8 x i16> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.hu: argument out of range ++entry: 
++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> %va, i32 32) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsubi_wu_lo(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> %va, i32 -1) ++ ret <4 x i32> %res ++} ++ ++define <4 x i32> @lsx_vsubi_wu_hi(<4 x i32> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.wu: argument out of range ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> %va, i32 32) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsubi_du_lo(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> %va, i32 -1) ++ ret <2 x i64> %res ++} ++ ++define <2 x i64> @lsx_vsubi_du_hi(<2 x i64> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vsubi.du: argument out of range ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> %va, i32 32) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-non-imm.ll +new file mode 100644 +index 000000000000..162f9ad131c7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-subi-non-imm.ll +@@ -0,0 +1,37 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vsubi_bu(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vsubi.bu(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} ++ ++declare <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16>, i32) ++ ++define <8 x i16> @lsx_vsubi_hu(<8 x i16> %va, i32 %b) nounwind { 
++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <8 x i16> @llvm.loongarch.lsx.vsubi.hu(<8 x i16> %va, i32 %b) ++ ret <8 x i16> %res ++} ++ ++declare <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32>, i32) ++ ++define <4 x i32> @lsx_vsubi_wu(<4 x i32> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <4 x i32> @llvm.loongarch.lsx.vsubi.wu(<4 x i32> %va, i32 %b) ++ ret <4 x i32> %res ++} ++ ++declare <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64>, i32) ++ ++define <2 x i64> @lsx_vsubi_du(<2 x i64> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <2 x i64> @llvm.loongarch.lsx.vsubi.du(<2 x i64> %va, i32 %b) ++ ret <2 x i64> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-invalid-imm.ll +new file mode 100644 +index 000000000000..5f5613189ac8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-invalid-imm.ll +@@ -0,0 +1,17 @@ ++; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vxori_b_lo(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vxori.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> %va, i32 -1) ++ ret <16 x i8> %res ++} ++ ++define <16 x i8> @lsx_vxori_b_hi(<16 x i8> %va) nounwind { ++; CHECK: llvm.loongarch.lsx.vxori.b: argument out of range ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> %va, i32 256) ++ ret <16 x i8> %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-non-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-non-imm.ll +new file mode 100644 +index 000000000000..4238d89120f1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-xori-non-imm.ll +@@ -0,0 +1,10 @@ ++; RUN: not llc 
--mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s ++ ++declare <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8>, i32) ++ ++define <16 x i8> @lsx_vxori_b(<16 x i8> %va, i32 %b) nounwind { ++; CHECK: immarg operand has non-immediate parameter ++entry: ++ %res = call <16 x i8> @llvm.loongarch.lsx.vxori.b(<16 x i8> %va, i32 %b) ++ ret <16 x i8> %res ++} +-- +2.20.1 + + +From 270c5590dbdb266e6df5bec97b38594824d9815a Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Wed, 23 Aug 2023 15:28:00 +0800 +Subject: [PATCH 10/35] [LoongArch] Reorder LoongArchTargetLowering(). NFC + +(cherry picked from commit 3693909ca47f1fafc97b441c91f5656acdd3907c) + +[LoongArch] Fix Subtarget.is64Bit + +(cherry picked from commit 749f36dae311000e1d69351707f4f24a72090c94) +--- + .../LoongArch/LoongArchISelLowering.cpp | 152 ++++++++++-------- + 1 file changed, 82 insertions(+), 70 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 3a40cd06a3eb..2f8ce57d3f5f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -47,20 +47,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + : TargetLowering(TM), Subtarget(STI) { + + MVT GRLenVT = Subtarget.getGRLenVT(); ++ + // Set up the register classes. 
++ + addRegisterClass(GRLenVT, &LoongArch::GPRRegClass); + if (Subtarget.hasBasicF()) + addRegisterClass(MVT::f32, &LoongArch::FPR32RegClass); + if (Subtarget.hasBasicD()) + addRegisterClass(MVT::f64, &LoongArch::FPR64RegClass); +- if (Subtarget.hasExtLSX()) +- for (auto VT : {MVT::v4f32, MVT::v2f64, MVT::v16i8, MVT::v8i16, MVT::v4i32, +- MVT::v2i64}) +- addRegisterClass(VT, &LoongArch::LSX128RegClass); +- if (Subtarget.hasExtLASX()) +- for (auto VT : {MVT::v8f32, MVT::v4f64, MVT::v32i8, MVT::v16i16, MVT::v8i32, +- MVT::v4i64}) +- addRegisterClass(VT, &LoongArch::LASX256RegClass); + + static const MVT::SimpleValueType LSXVTs[] = { + MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64}; +@@ -75,38 +69,57 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + for (MVT VT : LASXVTs) + addRegisterClass(VT, &LoongArch::LASX256RegClass); + ++ // Set operations for LA32 and LA64. ++ + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, GRLenVT, + MVT::i1, Promote); + +- // TODO: add necessary setOperationAction calls later. 
+ setOperationAction(ISD::SHL_PARTS, GRLenVT, Custom); + setOperationAction(ISD::SRA_PARTS, GRLenVT, Custom); + setOperationAction(ISD::SRL_PARTS, GRLenVT, Custom); + setOperationAction(ISD::FP_TO_SINT, GRLenVT, Custom); + setOperationAction(ISD::ROTL, GRLenVT, Expand); + setOperationAction(ISD::CTPOP, GRLenVT, Expand); +- setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); +- setOperationAction(ISD::TRAP, MVT::Other, Legal); +- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); +- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + + setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool, +- ISD::JumpTable}, ++ ISD::JumpTable, ISD::GlobalTLSAddress}, + GRLenVT, Custom); + +- setOperationAction(ISD::GlobalTLSAddress, GRLenVT, Custom); +- +- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); +- +- setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); +- if (Subtarget.is64Bit()) +- setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom); ++ setOperationAction(ISD::EH_DWARF_CFA, GRLenVT, Custom); + + setOperationAction(ISD::DYNAMIC_STACKALLOC, GRLenVT, Expand); + setOperationAction({ISD::STACKSAVE, ISD::STACKRESTORE}, MVT::Other, Expand); + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction({ISD::VAARG, ISD::VACOPY, ISD::VAEND}, MVT::Other, Expand); + ++ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); ++ setOperationAction(ISD::TRAP, MVT::Other, Legal); ++ ++ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); ++ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); ++ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); ++ ++ // Expand bitreverse.i16 with native-width bitrev and shift for now, before ++ // we get to know which of sll and revb.2h is faster. 
++ setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); ++ setOperationAction(ISD::BITREVERSE, GRLenVT, Legal); ++ ++ // LA32 does not have REVB.2W and REVB.D due to the 64-bit operands, and ++ // the narrower REVB.W does not exist. But LA32 does have REVB.2H, so i16 ++ // and i32 could still be byte-swapped relatively cheaply. ++ setOperationAction(ISD::BSWAP, MVT::i16, Custom); ++ ++ setOperationAction(ISD::BR_JT, MVT::Other, Expand); ++ setOperationAction(ISD::BR_CC, GRLenVT, Expand); ++ setOperationAction(ISD::SELECT_CC, GRLenVT, Expand); ++ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); ++ setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand); ++ ++ setOperationAction(ISD::FP_TO_UINT, GRLenVT, Custom); ++ setOperationAction(ISD::UINT_TO_FP, GRLenVT, Expand); ++ ++ // Set operations for LA64 only. ++ + if (Subtarget.is64Bit()) { + setOperationAction(ISD::SHL, MVT::i32, Custom); + setOperationAction(ISD::SRA, MVT::i32, Custom); +@@ -117,50 +130,39 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::ROTL, MVT::i32, Custom); + setOperationAction(ISD::CTTZ, MVT::i32, Custom); + setOperationAction(ISD::CTLZ, MVT::i32, Custom); +- setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); +- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i32, Custom); +- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); ++ setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom); + setOperationAction(ISD::READ_REGISTER, MVT::i32, Custom); + setOperationAction(ISD::WRITE_REGISTER, MVT::i32, Custom); ++ setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i32, Custom); +- if (Subtarget.hasBasicF() && !Subtarget.hasBasicD()) +- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); +- if (Subtarget.hasBasicF()) +- setOperationAction(ISD::FRINT, MVT::f32, Legal); +- if (Subtarget.hasBasicD()) +- setOperationAction(ISD::FRINT, MVT::f64, 
Legal); +- } ++ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i32, Custom); + +- // LA32 does not have REVB.2W and REVB.D due to the 64-bit operands, and +- // the narrower REVB.W does not exist. But LA32 does have REVB.2H, so i16 +- // and i32 could still be byte-swapped relatively cheaply. +- setOperationAction(ISD::BSWAP, MVT::i16, Custom); +- if (Subtarget.is64Bit()) { ++ setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); + setOperationAction(ISD::BSWAP, MVT::i32, Custom); + } + +- // Expand bitreverse.i16 with native-width bitrev and shift for now, before +- // we get to know which of sll and revb.2h is faster. +- setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); +- if (Subtarget.is64Bit()) { +- setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); +- setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); +- } else { +- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); +- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); ++ // Set operations for LA32 only. ++ ++ if (!Subtarget.is64Bit()) { + setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); + setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); +- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); ++ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); ++ ++ // Set libcalls. ++ setLibcallName(RTLIB::MUL_I128, nullptr); + } + + static const ISD::CondCode FPCCToExpand[] = { + ISD::SETOGT, ISD::SETOGE, ISD::SETUGT, ISD::SETUGE, + ISD::SETGE, ISD::SETNE, ISD::SETGT}; + ++ // Set operations for 'F' feature. 
++ + if (Subtarget.hasBasicF()) { + setCondCodeAction(FPCCToExpand, MVT::f32, Expand); ++ + setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f32, Legal); +@@ -173,14 +175,30 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f32, Expand); ++ ++ if (Subtarget.is64Bit()) ++ setOperationAction(ISD::FRINT, MVT::f32, Legal); ++ ++ if (!Subtarget.hasBasicD()) { ++ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); ++ if (Subtarget.is64Bit()) { ++ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); ++ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); ++ } ++ } + } ++ ++ // Set operations for 'D' feature. ++ + if (Subtarget.hasBasicD()) { ++ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); ++ setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setCondCodeAction(FPCCToExpand, MVT::f64, Expand); ++ + setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal); +- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); +@@ -189,35 +207,35 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); +- setTruncStoreAction(MVT::f64, MVT::f32, Expand); +- } +- +- setOperationAction(ISD::BR_JT, MVT::Other, Expand); + +- setOperationAction(ISD::BR_CC, GRLenVT, Expand); +- setOperationAction(ISD::SELECT_CC, 
GRLenVT, Expand); +- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); +- setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand); +- if (!Subtarget.is64Bit()) +- setLibcallName(RTLIB::MUL_I128, nullptr); +- +- setOperationAction(ISD::FP_TO_UINT, GRLenVT, Custom); +- setOperationAction(ISD::UINT_TO_FP, GRLenVT, Expand); +- if ((Subtarget.is64Bit() && Subtarget.hasBasicF() && +- !Subtarget.hasBasicD())) { +- setOperationAction(ISD::SINT_TO_FP, GRLenVT, Custom); +- setOperationAction(ISD::UINT_TO_FP, GRLenVT, Custom); ++ if (Subtarget.is64Bit()) ++ setOperationAction(ISD::FRINT, MVT::f64, Legal); + } + ++ // Set operations for 'LSX' feature. ++ + if (Subtarget.hasExtLSX()) + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, + {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8}, Legal); + ++ // Set operations for 'LASX' feature. ++ + if (Subtarget.hasExtLASX()) + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, + {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}, + Legal); + ++ // Set DAG combine for LA32 and LA64. ++ ++ setTargetDAGCombine(ISD::AND); ++ setTargetDAGCombine(ISD::OR); ++ setTargetDAGCombine(ISD::SRL); ++ ++ // Set DAG combine for 'LSX' feature. ++ ++ if (Subtarget.hasExtLSX()) ++ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); ++ + // Compute derived properties from the register classes. 
+ computeRegisterProperties(Subtarget.getRegisterInfo()); + +@@ -235,12 +253,6 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setPrefFunctionAlignment(Subtarget.getPrefFunctionAlignment()); + setPrefLoopAlignment(Subtarget.getPrefLoopAlignment()); + setMaxBytesForAlignment(Subtarget.getMaxBytesForAlignment()); +- +- setTargetDAGCombine(ISD::AND); +- setTargetDAGCombine(ISD::OR); +- setTargetDAGCombine(ISD::SRL); +- if (Subtarget.hasExtLSX()) +- setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + } + + bool LoongArchTargetLowering::isOffsetFoldingLegal( +-- +2.20.1 + + +From 9b554aa98f070e4fdbf2a76cca811db411ec3312 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 29 Aug 2023 19:16:20 +0800 +Subject: [PATCH 11/35] [LoongArch] Fix typos. NFC + +(cherry picked from commit 30b6b27385f8ddc550df54a097434a121ae56d12) +--- + .../LoongArch/LoongArchLASXInstrInfo.td | 52 +++++++++---------- + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 50 +++++++++--------- + 2 files changed, 51 insertions(+), 51 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index a3afd4789dfc..947950be2b8f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1545,10 +1545,10 @@ foreach Inst = ["XVEXTH_Q_D", "XVEXTH_QU_DU", "XVMSKLTZ_D", + // Pat<(Intrinsic timm:$imm) + // (LAInst timm:$imm)>; + def : Pat<(int_loongarch_lasx_xvldi timm:$imm), +- (XVLDI (to_valide_timm timm:$imm))>; ++ (XVLDI (to_valid_timm timm:$imm))>; + foreach Inst = ["XVREPLI_B", "XVREPLI_H", "XVREPLI_W", "XVREPLI_D"] in + def : Pat<(deriveLASXIntrinsic.ret timm:$imm), +- (!cast("Pseudo"#Inst) (to_valide_timm timm:$imm))>; ++ (!cast("Pseudo"#Inst) (to_valid_timm timm:$imm))>; + + // vty: v32i8/v16i16/v8i32/v4i64 + // Pat<(Intrinsic vty:$xj, timm:$imm) +@@ -1558,25 +1558,25 @@ foreach Inst = ["XVSAT_B", "XVSAT_BU", "XVNORI_B", "XVROTRI_B", 
"XVSLLWIL_H_B", + "XVSEQI_B", "XVSLEI_B", "XVSLEI_BU", "XVSLTI_B", "XVSLTI_BU", + "XVREPL128VEI_B", "XVBSLL_V", "XVBSRL_V", "XVSHUF4I_B"] in + def : Pat<(deriveLASXIntrinsic.ret (v32i8 LASX256:$xj), timm:$imm), +- (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LASX256:$xj, (to_valid_timm timm:$imm))>; + foreach Inst = ["XVSAT_H", "XVSAT_HU", "XVROTRI_H", "XVSLLWIL_W_H", + "XVSLLWIL_WU_HU", "XVSRLRI_H", "XVSRARI_H", + "XVSEQI_H", "XVSLEI_H", "XVSLEI_HU", "XVSLTI_H", "XVSLTI_HU", + "XVREPL128VEI_H", "XVSHUF4I_H"] in + def : Pat<(deriveLASXIntrinsic.ret (v16i16 LASX256:$xj), timm:$imm), +- (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LASX256:$xj, (to_valid_timm timm:$imm))>; + foreach Inst = ["XVSAT_W", "XVSAT_WU", "XVROTRI_W", "XVSLLWIL_D_W", + "XVSLLWIL_DU_WU", "XVSRLRI_W", "XVSRARI_W", + "XVSEQI_W", "XVSLEI_W", "XVSLEI_WU", "XVSLTI_W", "XVSLTI_WU", + "XVREPL128VEI_W", "XVSHUF4I_W", "XVPICKVE_W"] in + def : Pat<(deriveLASXIntrinsic.ret (v8i32 LASX256:$xj), timm:$imm), +- (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LASX256:$xj, (to_valid_timm timm:$imm))>; + foreach Inst = ["XVSAT_D", "XVSAT_DU", "XVROTRI_D", "XVSRLRI_D", "XVSRARI_D", + "XVSEQI_D", "XVSLEI_D", "XVSLEI_DU", "XVSLTI_D", "XVSLTI_DU", + "XVPICKVE2GR_D", "XVPICKVE2GR_DU", + "XVREPL128VEI_D", "XVPERMI_D", "XVPICKVE_D"] in + def : Pat<(deriveLASXIntrinsic.ret (v4i64 LASX256:$xj), timm:$imm), +- (!cast(Inst) LASX256:$xj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LASX256:$xj, (to_valid_timm timm:$imm))>; + + // vty: v32i8/v16i16/v8i32/v4i64 + // Pat<(Intrinsic vty:$xd, vty:$xj, timm:$imm) +@@ -1588,7 +1588,7 @@ foreach Inst = ["XVSRLNI_B_H", "XVSRANI_B_H", "XVSRLRNI_B_H", "XVSRARNI_B_H", + def : Pat<(deriveLASXIntrinsic.ret + (v32i8 LASX256:$xd), (v32i8 LASX256:$xj), timm:$imm), + (!cast(Inst) LASX256:$xd, LASX256:$xj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + foreach Inst = ["XVSRLNI_H_W", 
"XVSRANI_H_W", "XVSRLRNI_H_W", "XVSRARNI_H_W", + "XVSSRLNI_H_W", "XVSSRANI_H_W", "XVSSRLNI_HU_W", "XVSSRANI_HU_W", + "XVSSRLRNI_H_W", "XVSSRARNI_H_W", "XVSSRLRNI_HU_W", "XVSSRARNI_HU_W", +@@ -1596,7 +1596,7 @@ foreach Inst = ["XVSRLNI_H_W", "XVSRANI_H_W", "XVSRLRNI_H_W", "XVSRARNI_H_W", + def : Pat<(deriveLASXIntrinsic.ret + (v16i16 LASX256:$xd), (v16i16 LASX256:$xj), timm:$imm), + (!cast(Inst) LASX256:$xd, LASX256:$xj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + foreach Inst = ["XVSRLNI_W_D", "XVSRANI_W_D", "XVSRLRNI_W_D", "XVSRARNI_W_D", + "XVSSRLNI_W_D", "XVSSRANI_W_D", "XVSSRLNI_WU_D", "XVSSRANI_WU_D", + "XVSSRLRNI_W_D", "XVSSRARNI_W_D", "XVSSRLRNI_WU_D", "XVSSRARNI_WU_D", +@@ -1604,7 +1604,7 @@ foreach Inst = ["XVSRLNI_W_D", "XVSRANI_W_D", "XVSRLRNI_W_D", "XVSRARNI_W_D", + def : Pat<(deriveLASXIntrinsic.ret + (v8i32 LASX256:$xd), (v8i32 LASX256:$xj), timm:$imm), + (!cast(Inst) LASX256:$xd, LASX256:$xj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + foreach Inst = ["XVSRLNI_D_Q", "XVSRANI_D_Q", "XVSRLRNI_D_Q", "XVSRARNI_D_Q", + "XVSSRLNI_D_Q", "XVSSRANI_D_Q", "XVSSRLNI_DU_Q", "XVSSRANI_DU_Q", + "XVSSRLRNI_D_Q", "XVSSRARNI_D_Q", "XVSSRLRNI_DU_Q", "XVSSRARNI_DU_Q", +@@ -1612,7 +1612,7 @@ foreach Inst = ["XVSRLNI_D_Q", "XVSRANI_D_Q", "XVSRLRNI_D_Q", "XVSRARNI_D_Q", + def : Pat<(deriveLASXIntrinsic.ret + (v4i64 LASX256:$xd), (v4i64 LASX256:$xj), timm:$imm), + (!cast(Inst) LASX256:$xd, LASX256:$xj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + + // vty: v32i8/v16i16/v8i32/v4i64 + // Pat<(Intrinsic vty:$xd, vty:$xj, vty:$xk), +@@ -1693,42 +1693,42 @@ foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_ + (!cast(Inst) LASX256:$xj)>; + + def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), +- (XVPICKVE_W v8f32:$xj, (to_valide_timm timm:$imm))>; ++ (XVPICKVE_W v8f32:$xj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm), 
+- (XVPICKVE_D v4f64:$xj, (to_valide_timm timm:$imm))>; ++ (XVPICKVE_D v4f64:$xj, (to_valid_timm timm:$imm))>; + + // load + def : Pat<(int_loongarch_lasx_xvld GPR:$rj, timm:$imm), +- (XVLD GPR:$rj, (to_valide_timm timm:$imm))>; ++ (XVLD GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvldx GPR:$rj, GPR:$rk), + (XVLDX GPR:$rj, GPR:$rk)>; + + def : Pat<(int_loongarch_lasx_xvldrepl_b GPR:$rj, timm:$imm), +- (XVLDREPL_B GPR:$rj, (to_valide_timm timm:$imm))>; ++ (XVLDREPL_B GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvldrepl_h GPR:$rj, timm:$imm), +- (XVLDREPL_H GPR:$rj, (to_valide_timm timm:$imm))>; ++ (XVLDREPL_H GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvldrepl_w GPR:$rj, timm:$imm), +- (XVLDREPL_W GPR:$rj, (to_valide_timm timm:$imm))>; ++ (XVLDREPL_W GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvldrepl_d GPR:$rj, timm:$imm), +- (XVLDREPL_D GPR:$rj, (to_valide_timm timm:$imm))>; ++ (XVLDREPL_D GPR:$rj, (to_valid_timm timm:$imm))>; + + // store + def : Pat<(int_loongarch_lasx_xvst LASX256:$xd, GPR:$rj, timm:$imm), +- (XVST LASX256:$xd, GPR:$rj, (to_valide_timm timm:$imm))>; ++ (XVST LASX256:$xd, GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lasx_xvstx LASX256:$xd, GPR:$rj, GPR:$rk), + (XVSTX LASX256:$xd, GPR:$rj, GPR:$rk)>; + + def : Pat<(int_loongarch_lasx_xvstelm_b v32i8:$xd, GPR:$rj, timm:$imm, timm:$idx), +- (XVSTELM_B v32i8:$xd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (XVSTELM_B v32i8:$xd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + def : Pat<(int_loongarch_lasx_xvstelm_h v16i16:$xd, GPR:$rj, timm:$imm, timm:$idx), +- (XVSTELM_H v16i16:$xd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (XVSTELM_H v16i16:$xd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + def : Pat<(int_loongarch_lasx_xvstelm_w v8i32:$xd, GPR:$rj, timm:$imm, 
timm:$idx), +- (XVSTELM_W v8i32:$xd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (XVSTELM_W v8i32:$xd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + def : Pat<(int_loongarch_lasx_xvstelm_d v4i64:$xd, GPR:$rj, timm:$imm, timm:$idx), +- (XVSTELM_D v4i64:$xd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (XVSTELM_D v4i64:$xd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + + } // Predicates = [HasExtLASX] +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 13332be0bc38..e021adcecf4d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -145,7 +145,7 @@ def lsxsplati32 : PatFrag<(ops node:$e0), + def lsxsplati64 : PatFrag<(ops node:$e0), + (v2i64 (build_vector node:$e0, node:$e0))>; + +-def to_valide_timm : SDNodeXForm(N); + return CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(N), Subtarget->getGRLenVT()); + }]>; +@@ -1639,10 +1639,10 @@ foreach Inst = ["VEXTH_Q_D", "VEXTH_QU_DU", "VMSKLTZ_D", + // Pat<(Intrinsic timm:$imm) + // (LAInst timm:$imm)>; + def : Pat<(int_loongarch_lsx_vldi timm:$imm), +- (VLDI (to_valide_timm timm:$imm))>; ++ (VLDI (to_valid_timm timm:$imm))>; + foreach Inst = ["VREPLI_B", "VREPLI_H", "VREPLI_W", "VREPLI_D"] in + def : Pat<(deriveLSXIntrinsic.ret timm:$imm), +- (!cast("Pseudo"#Inst) (to_valide_timm timm:$imm))>; ++ (!cast("Pseudo"#Inst) (to_valid_timm timm:$imm))>; + + // vty: v16i8/v8i16/v4i32/v2i64 + // Pat<(Intrinsic vty:$vj, timm:$imm) +@@ -1652,25 +1652,25 @@ foreach Inst = ["VSAT_B", "VSAT_BU", "VNORI_B", "VROTRI_B", "VSLLWIL_H_B", + "VSEQI_B", "VSLEI_B", "VSLEI_BU", "VSLTI_B", "VSLTI_BU", + "VREPLVEI_B", "VBSLL_V", "VBSRL_V", "VSHUF4I_B"] in + def : Pat<(deriveLSXIntrinsic.ret (v16i8 LSX128:$vj), timm:$imm), +- (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) 
LSX128:$vj, (to_valid_timm timm:$imm))>; + foreach Inst = ["VSAT_H", "VSAT_HU", "VROTRI_H", "VSLLWIL_W_H", + "VSLLWIL_WU_HU", "VSRLRI_H", "VSRARI_H", + "VSEQI_H", "VSLEI_H", "VSLEI_HU", "VSLTI_H", "VSLTI_HU", + "VREPLVEI_H", "VSHUF4I_H"] in + def : Pat<(deriveLSXIntrinsic.ret (v8i16 LSX128:$vj), timm:$imm), +- (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LSX128:$vj, (to_valid_timm timm:$imm))>; + foreach Inst = ["VSAT_W", "VSAT_WU", "VROTRI_W", "VSLLWIL_D_W", + "VSLLWIL_DU_WU", "VSRLRI_W", "VSRARI_W", + "VSEQI_W", "VSLEI_W", "VSLEI_WU", "VSLTI_W", "VSLTI_WU", + "VREPLVEI_W", "VSHUF4I_W"] in + def : Pat<(deriveLSXIntrinsic.ret (v4i32 LSX128:$vj), timm:$imm), +- (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LSX128:$vj, (to_valid_timm timm:$imm))>; + foreach Inst = ["VSAT_D", "VSAT_DU", "VROTRI_D", "VSRLRI_D", "VSRARI_D", + "VSEQI_D", "VSLEI_D", "VSLEI_DU", "VSLTI_D", "VSLTI_DU", + "VPICKVE2GR_D", "VPICKVE2GR_DU", + "VREPLVEI_D"] in + def : Pat<(deriveLSXIntrinsic.ret (v2i64 LSX128:$vj), timm:$imm), +- (!cast(Inst) LSX128:$vj, (to_valide_timm timm:$imm))>; ++ (!cast(Inst) LSX128:$vj, (to_valid_timm timm:$imm))>; + + // vty: v16i8/v8i16/v4i32/v2i64 + // Pat<(Intrinsic vty:$vd, vty:$vj, timm:$imm) +@@ -1682,7 +1682,7 @@ foreach Inst = ["VSRLNI_B_H", "VSRANI_B_H", "VSRLRNI_B_H", "VSRARNI_B_H", + def : Pat<(deriveLSXIntrinsic.ret + (v16i8 LSX128:$vd), (v16i8 LSX128:$vj), timm:$imm), + (!cast(Inst) LSX128:$vd, LSX128:$vj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + foreach Inst = ["VSRLNI_H_W", "VSRANI_H_W", "VSRLRNI_H_W", "VSRARNI_H_W", + "VSSRLNI_H_W", "VSSRANI_H_W", "VSSRLNI_HU_W", "VSSRANI_HU_W", + "VSSRLRNI_H_W", "VSSRARNI_H_W", "VSSRLRNI_HU_W", "VSSRARNI_HU_W", +@@ -1690,7 +1690,7 @@ foreach Inst = ["VSRLNI_H_W", "VSRANI_H_W", "VSRLRNI_H_W", "VSRARNI_H_W", + def : Pat<(deriveLSXIntrinsic.ret + (v8i16 LSX128:$vd), (v8i16 LSX128:$vj), timm:$imm), + (!cast(Inst) LSX128:$vd, LSX128:$vj, +- 
(to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + foreach Inst = ["VSRLNI_W_D", "VSRANI_W_D", "VSRLRNI_W_D", "VSRARNI_W_D", + "VSSRLNI_W_D", "VSSRANI_W_D", "VSSRLNI_WU_D", "VSSRANI_WU_D", + "VSSRLRNI_W_D", "VSSRARNI_W_D", "VSSRLRNI_WU_D", "VSSRARNI_WU_D", +@@ -1698,7 +1698,7 @@ foreach Inst = ["VSRLNI_W_D", "VSRANI_W_D", "VSRLRNI_W_D", "VSRARNI_W_D", + def : Pat<(deriveLSXIntrinsic.ret + (v4i32 LSX128:$vd), (v4i32 LSX128:$vj), timm:$imm), + (!cast(Inst) LSX128:$vd, LSX128:$vj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + foreach Inst = ["VSRLNI_D_Q", "VSRANI_D_Q", "VSRLRNI_D_Q", "VSRARNI_D_Q", + "VSSRLNI_D_Q", "VSSRANI_D_Q", "VSSRLNI_DU_Q", "VSSRANI_DU_Q", + "VSSRLRNI_D_Q", "VSSRARNI_D_Q", "VSSRLRNI_DU_Q", "VSSRARNI_DU_Q", +@@ -1706,7 +1706,7 @@ foreach Inst = ["VSRLNI_D_Q", "VSRANI_D_Q", "VSRLRNI_D_Q", "VSRARNI_D_Q", + def : Pat<(deriveLSXIntrinsic.ret + (v2i64 LSX128:$vd), (v2i64 LSX128:$vj), timm:$imm), + (!cast(Inst) LSX128:$vd, LSX128:$vj, +- (to_valide_timm timm:$imm))>; ++ (to_valid_timm timm:$imm))>; + + // vty: v16i8/v8i16/v4i32/v2i64 + // Pat<(Intrinsic vty:$vd, vty:$vj, vty:$vk), +@@ -1788,36 +1788,36 @@ foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D", + + // load + def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm), +- (VLD GPR:$rj, (to_valide_timm timm:$imm))>; ++ (VLD GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lsx_vldx GPR:$rj, GPR:$rk), + (VLDX GPR:$rj, GPR:$rk)>; + + def : Pat<(int_loongarch_lsx_vldrepl_b GPR:$rj, timm:$imm), +- (VLDREPL_B GPR:$rj, (to_valide_timm timm:$imm))>; ++ (VLDREPL_B GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lsx_vldrepl_h GPR:$rj, timm:$imm), +- (VLDREPL_H GPR:$rj, (to_valide_timm timm:$imm))>; ++ (VLDREPL_H GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lsx_vldrepl_w GPR:$rj, timm:$imm), +- (VLDREPL_W GPR:$rj, (to_valide_timm timm:$imm))>; ++ (VLDREPL_W GPR:$rj, (to_valid_timm timm:$imm))>; + 
def : Pat<(int_loongarch_lsx_vldrepl_d GPR:$rj, timm:$imm), +- (VLDREPL_D GPR:$rj, (to_valide_timm timm:$imm))>; ++ (VLDREPL_D GPR:$rj, (to_valid_timm timm:$imm))>; + + // store + def : Pat<(int_loongarch_lsx_vst LSX128:$vd, GPR:$rj, timm:$imm), +- (VST LSX128:$vd, GPR:$rj, (to_valide_timm timm:$imm))>; ++ (VST LSX128:$vd, GPR:$rj, (to_valid_timm timm:$imm))>; + def : Pat<(int_loongarch_lsx_vstx LSX128:$vd, GPR:$rj, GPR:$rk), + (VSTX LSX128:$vd, GPR:$rj, GPR:$rk)>; + + def : Pat<(int_loongarch_lsx_vstelm_b v16i8:$vd, GPR:$rj, timm:$imm, timm:$idx), +- (VSTELM_B v16i8:$vd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (VSTELM_B v16i8:$vd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + def : Pat<(int_loongarch_lsx_vstelm_h v8i16:$vd, GPR:$rj, timm:$imm, timm:$idx), +- (VSTELM_H v8i16:$vd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (VSTELM_H v8i16:$vd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + def : Pat<(int_loongarch_lsx_vstelm_w v4i32:$vd, GPR:$rj, timm:$imm, timm:$idx), +- (VSTELM_W v4i32:$vd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (VSTELM_W v4i32:$vd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + def : Pat<(int_loongarch_lsx_vstelm_d v2i64:$vd, GPR:$rj, timm:$imm, timm:$idx), +- (VSTELM_D v2i64:$vd, GPR:$rj, (to_valide_timm timm:$imm), +- (to_valide_timm timm:$idx))>; ++ (VSTELM_D v2i64:$vd, GPR:$rj, (to_valid_timm timm:$imm), ++ (to_valid_timm timm:$idx))>; + + } // Predicates = [HasExtLSX] +-- +2.20.1 + + +From 14892c2a03810b1e01aa62e8a5f12e4f4272bf23 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 24 Oct 2023 15:46:56 +0800 +Subject: [PATCH 12/35] [LoongArch] Set some operations action for LSX and LASX + +First, expand all truncationg stores and extending loads. Second, +expand everything for `fixedlen_vector_valuetypes`. Finally, we +selectively turn on ones that can be effectively codegen'd. 
+ +Simultaneously, this patch adds floating-point vector types to +load/store patterns. Additional test cases will be included in the IR +instruction test patchs. + +(cherry picked from commit f2441a06c609cedbb7e11303907f07bf0ca5cb2f) +--- + .../LoongArch/LoongArchISelLowering.cpp | 74 +++++++++++++++++-- + .../LoongArch/LoongArchLASXInstrInfo.td | 2 +- + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 2 +- + 3 files changed, 69 insertions(+), 9 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 2f8ce57d3f5f..d3627cec2e8c 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -214,16 +214,76 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + // Set operations for 'LSX' feature. + +- if (Subtarget.hasExtLSX()) +- setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, +- {MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8}, Legal); ++ if (Subtarget.hasExtLSX()) { ++ for (MVT VT : MVT::fixedlen_vector_valuetypes()) { ++ // Expand all truncating stores and extending loads. ++ for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { ++ setTruncStoreAction(VT, InnerVT, Expand); ++ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); ++ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); ++ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); ++ } ++ // By default everything must be expanded. Then we will selectively turn ++ // on ones that can be effectively codegen'd. ++ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) ++ setOperationAction(Op, VT, Expand); ++ } ++ ++ for (MVT VT : LSXVTs) { ++ setOperationAction({ISD::LOAD, ISD::STORE}, VT, Legal); ++ setOperationAction(ISD::BITCAST, VT, Legal); ++ setOperationAction(ISD::UNDEF, VT, Legal); ++ ++ // FIXME: For BUILD_VECTOR, it is temporarily set to `Legal` here, and it ++ // will be `Custom` handled in the future. 
++ setOperationAction(ISD::BUILD_VECTOR, VT, Legal); ++ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ } ++ for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { ++ setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); ++ setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, ++ Legal); ++ setOperationAction({ISD::MUL, ISD::SDIV, ISD::SREM, ISD::UDIV, ISD::UREM}, ++ VT, Legal); ++ setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, VT, Legal); ++ setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); ++ setOperationAction(ISD::CTPOP, VT, Legal); ++ } ++ for (MVT VT : {MVT::v4f32, MVT::v2f64}) { ++ setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); ++ setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); ++ setOperationAction(ISD::FMA, VT, Legal); ++ } ++ } + + // Set operations for 'LASX' feature. + +- if (Subtarget.hasExtLASX()) +- setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, +- {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}, +- Legal); ++ if (Subtarget.hasExtLASX()) { ++ for (MVT VT : LASXVTs) { ++ setOperationAction({ISD::LOAD, ISD::STORE}, VT, Legal); ++ setOperationAction(ISD::BITCAST, VT, Legal); ++ setOperationAction(ISD::UNDEF, VT, Legal); ++ ++ // FIXME: Same as above. 
++ setOperationAction(ISD::BUILD_VECTOR, VT, Legal); ++ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ } ++ for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { ++ setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); ++ setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, ++ Legal); ++ setOperationAction({ISD::MUL, ISD::SDIV, ISD::SREM, ISD::UDIV, ISD::UREM}, ++ VT, Legal); ++ setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, VT, Legal); ++ setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); ++ setOperationAction(ISD::CTPOP, VT, Legal); ++ } ++ for (MVT VT : {MVT::v8f32, MVT::v4f64}) { ++ setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); ++ setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); ++ setOperationAction(ISD::FMA, VT, Legal); ++ } ++ } + + // Set DAG combine for LA32 and LA64. + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 947950be2b8f..e19aa92266b1 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1394,7 +1394,7 @@ def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk), + (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>; + + // Loads/Stores +-foreach vt = [v32i8, v16i16, v8i32, v4i64] in { ++foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { + defm : LdPat; + def : RegRegLdPat; + defm : StPat; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index e021adcecf4d..9391b1a8a20c 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1494,7 +1494,7 @@ def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk), + (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>; + + // Loads/Stores +-foreach vt = [v16i8, v8i16, v4i32, v2i64] in { ++foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { + defm : LdPat; + def : 
RegRegLdPat; + defm : StPat; +-- +2.20.1 + + +From 85d34e0b7e9947dda7ea981aa1dc10714fd44de5 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Thu, 30 Nov 2023 17:29:18 +0800 +Subject: [PATCH 13/35] [LoongArch] Add codegen support for extractelement + (#73759) + +Add codegen support for extractelement when enable `lsx` or `lasx` +feature. + +(cherry picked from commit b72456120f1db38ed7068fb592fcf768c6d5cce2) +--- + .../LoongArch/LoongArchISelLowering.cpp | 2 + + .../Target/LoongArch/LoongArchInstrInfo.cpp | 8 + + .../LoongArch/LoongArchLASXInstrInfo.td | 38 ++++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 38 ++++ + .../lasx/ir-instruction/extractelement.ll | 172 ++++++++++++++++++ + .../lsx/ir-instruction/extractelement.ll | 170 +++++++++++++++++ + 6 files changed, 428 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index d3627cec2e8c..26e94a53b344 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -238,6 +238,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + // will be `Custom` handled in the future. + setOperationAction(ISD::BUILD_VECTOR, VT, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); +@@ -267,6 +268,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + // FIXME: Same as above. 
+ setOperationAction(ISD::BUILD_VECTOR, VT, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +index ddd1c9943fac..6576100d3b32 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +@@ -90,6 +90,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + Opc = LoongArch::FMOV_S; + } else if (LoongArch::FPR64RegClass.contains(DstReg, SrcReg)) { + Opc = LoongArch::FMOV_D; ++ } else if (LoongArch::GPRRegClass.contains(DstReg) && ++ LoongArch::FPR32RegClass.contains(SrcReg)) { ++ // FPR32 -> GPR copies ++ Opc = LoongArch::MOVFR2GR_S; ++ } else if (LoongArch::GPRRegClass.contains(DstReg) && ++ LoongArch::FPR64RegClass.contains(SrcReg)) { ++ // FPR64 -> GPR copies ++ Opc = LoongArch::MOVFR2GR_D; + } else { + // TODO: support other copies. + llvm_unreachable("Impossible reg-to-reg copy"); +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index e19aa92266b1..380206ddcf10 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1401,6 +1401,44 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { + def : RegRegStPat; + } + ++// Vector extraction with constant index. 
++def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), ++ (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; ++def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), ++ (VPICKVE2GR_H (EXTRACT_SUBREG v16i16:$xj, sub_128), uimm3:$imm)>; ++def : Pat<(i64 (vector_extract v8i32:$xj, uimm2:$imm)), ++ (VPICKVE2GR_W (EXTRACT_SUBREG v8i32:$xj, sub_128), uimm2:$imm)>; ++def : Pat<(i64 (vector_extract v4i64:$xj, uimm1:$imm)), ++ (VPICKVE2GR_D (EXTRACT_SUBREG v4i64:$xj, sub_128), uimm1:$imm)>; ++def : Pat<(f32 (vector_extract v8f32:$xj, uimm2:$imm)), ++ (f32 (EXTRACT_SUBREG (XVREPL128VEI_W v8f32:$xj, uimm2:$imm), sub_32))>; ++def : Pat<(f64 (vector_extract v4f64:$xj, uimm1:$imm)), ++ (f64 (EXTRACT_SUBREG (XVREPL128VEI_D v4f64:$xj, uimm1:$imm), sub_64))>; ++ ++// Vector extraction with variable index. ++def : Pat<(i64 (vector_extract v32i8:$xj, i64:$rk)), ++ (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_B v32i8:$xj, ++ i64:$rk), ++ sub_32)), ++ GPR), (i64 24))>; ++def : Pat<(i64 (vector_extract v16i16:$xj, i64:$rk)), ++ (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_H v16i16:$xj, ++ i64:$rk), ++ sub_32)), ++ GPR), (i64 16))>; ++def : Pat<(i64 (vector_extract v8i32:$xj, i64:$rk)), ++ (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_W v8i32:$xj, i64:$rk), ++ sub_32)), ++ GPR)>; ++def : Pat<(i64 (vector_extract v4i64:$xj, i64:$rk)), ++ (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (XVREPLVE_D v4i64:$xj, i64:$rk), ++ sub_64)), ++ GPR)>; ++def : Pat<(f32 (vector_extract v8f32:$xj, i64:$rk)), ++ (f32 (EXTRACT_SUBREG (XVREPLVE_W v8f32:$xj, i64:$rk), sub_32))>; ++def : Pat<(f64 (vector_extract v4f64:$xj, i64:$rk)), ++ (f64 (EXTRACT_SUBREG (XVREPLVE_D v4f64:$xj, i64:$rk), sub_64))>; ++ + } // Predicates = [HasExtLASX] + + /// Intrinsic pattern +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 9391b1a8a20c..980870e34503 100644 +--- 
a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1501,6 +1501,44 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { + def : RegRegStPat; + } + ++// Vector extraction with constant index. ++def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)), ++ (VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>; ++def : Pat<(i64 (vector_extract v8i16:$vj, uimm3:$imm)), ++ (VPICKVE2GR_H v8i16:$vj, uimm3:$imm)>; ++def : Pat<(i64 (vector_extract v4i32:$vj, uimm2:$imm)), ++ (VPICKVE2GR_W v4i32:$vj, uimm2:$imm)>; ++def : Pat<(i64 (vector_extract v2i64:$vj, uimm1:$imm)), ++ (VPICKVE2GR_D v2i64:$vj, uimm1:$imm)>; ++def : Pat<(f32 (vector_extract v4f32:$vj, uimm2:$imm)), ++ (f32 (EXTRACT_SUBREG (VREPLVEI_W v4f32:$vj, uimm2:$imm), sub_32))>; ++def : Pat<(f64 (vector_extract v2f64:$vj, uimm1:$imm)), ++ (f64 (EXTRACT_SUBREG (VREPLVEI_D v2f64:$vj, uimm1:$imm), sub_64))>; ++ ++// Vector extraction with variable index. ++def : Pat<(i64 (vector_extract v16i8:$vj, i64:$rk)), ++ (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (VREPLVE_B v16i8:$vj, ++ i64:$rk), ++ sub_32)), ++ GPR), (i64 24))>; ++def : Pat<(i64 (vector_extract v8i16:$vj, i64:$rk)), ++ (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (VREPLVE_H v8i16:$vj, ++ i64:$rk), ++ sub_32)), ++ GPR), (i64 16))>; ++def : Pat<(i64 (vector_extract v4i32:$vj, i64:$rk)), ++ (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (VREPLVE_W v4i32:$vj, i64:$rk), ++ sub_32)), ++ GPR)>; ++def : Pat<(i64 (vector_extract v2i64:$vj, i64:$rk)), ++ (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (VREPLVE_D v2i64:$vj, i64:$rk), ++ sub_64)), ++ GPR)>; ++def : Pat<(f32 (vector_extract v4f32:$vj, i64:$rk)), ++ (f32 (EXTRACT_SUBREG (VREPLVE_W v4f32:$vj, i64:$rk), sub_32))>; ++def : Pat<(f64 (vector_extract v2f64:$vj, i64:$rk)), ++ (f64 (EXTRACT_SUBREG (VREPLVE_D v2f64:$vj, i64:$rk), sub_64))>; ++ + } // Predicates = [HasExtLSX] + + /// Intrinsic pattern +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +new file mode 100644 +index 000000000000..78f584cd09a8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +@@ -0,0 +1,172 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @extract_32xi8(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_32xi8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 1 ++; CHECK-NEXT: st.b $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <32 x i8>, ptr %src ++ %e = extractelement <32 x i8> %v, i32 1 ++ store i8 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_16xi16(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_16xi16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 1 ++; CHECK-NEXT: st.h $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i16>, ptr %src ++ %e = extractelement <16 x i16> %v, i32 1 ++ store i16 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_8xi32(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_8xi32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1 ++; CHECK-NEXT: st.w $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i32>, ptr %src ++ %e = extractelement <8 x i32> %v, i32 1 ++ store i32 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xi64(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_4xi64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 ++; CHECK-NEXT: st.d $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i64>, ptr %src ++ %e = extractelement <4 x i64> %v, i32 1 ++ store i64 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_8xfloat(ptr 
%src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_8xfloat: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: ori $a0, $zero, 7 ++; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a0 ++; CHECK-NEXT: fst.s $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x float>, ptr %src ++ %e = extractelement <8 x float> %v, i32 7 ++ store float %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xdouble(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_4xdouble: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: ori $a0, $zero, 3 ++; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a0 ++; CHECK-NEXT: fst.d $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x double>, ptr %src ++ %e = extractelement <4 x double> %v, i32 3 ++ store double %e, ptr %dst ++ ret void ++} ++ ++define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_32xi8_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvreplve.b $xr0, $xr0, $a2 ++; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: srai.w $a0, $a0, 24 ++; CHECK-NEXT: st.b $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <32 x i8>, ptr %src ++ %e = extractelement <32 x i8> %v, i32 %idx ++ store i8 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_16xi16_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvreplve.h $xr0, $xr0, $a2 ++; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: srai.w $a0, $a0, 16 ++; CHECK-NEXT: st.h $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i16>, ptr %src ++ %e = extractelement <16 x i16> %v, i32 %idx ++ store i16 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_8xi32_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 
++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a2 ++; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: st.w $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i32>, ptr %src ++ %e = extractelement <8 x i32> %v, i32 %idx ++ store i32 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_4xi64_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a2 ++; CHECK-NEXT: movfr2gr.d $a0, $fa0 ++; CHECK-NEXT: st.d $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i64>, ptr %src ++ %e = extractelement <4 x i64> %v, i32 %idx ++ store i64 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_8xfloat_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a2 ++; CHECK-NEXT: fst.s $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x float>, ptr %src ++ %e = extractelement <8 x float> %v, i32 %idx ++ store float %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_4xdouble_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a2 ++; CHECK-NEXT: fst.d $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x double>, ptr %src ++ %e = extractelement <4 x double> %v, i32 %idx ++ store double %e, ptr %dst ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll +new file mode 100644 +index 000000000000..b8798c97861e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll +@@ -0,0 +1,170 @@ ++; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @extract_16xi8(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_16xi8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 1 ++; CHECK-NEXT: st.b $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i8>, ptr %src ++ %e = extractelement <16 x i8> %v, i32 1 ++ store i8 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_8xi16(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_8xi16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 1 ++; CHECK-NEXT: st.h $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i16>, ptr %src ++ %e = extractelement <8 x i16> %v, i32 1 ++ store i16 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xi32(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_4xi32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1 ++; CHECK-NEXT: st.w $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i32>, ptr %src ++ %e = extractelement <4 x i32> %v, i32 1 ++ store i32 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_2xi64(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_2xi64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 ++; CHECK-NEXT: st.d $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x i64>, ptr %src ++ %e = extractelement <2 x i64> %v, i32 1 ++ store i64 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xfloat(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_4xfloat: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 1 ++; CHECK-NEXT: fst.s $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x float>, ptr %src ++ %e = extractelement <4 x float> %v, i32 1 ++ store float %e, ptr %dst ++ ret 
void ++} ++ ++define void @extract_2xdouble(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract_2xdouble: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1 ++; CHECK-NEXT: fst.d $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x double>, ptr %src ++ %e = extractelement <2 x double> %v, i32 1 ++ store double %e, ptr %dst ++ ret void ++} ++ ++define void @extract_16xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_16xi8_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplve.b $vr0, $vr0, $a2 ++; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: srai.w $a0, $a0, 24 ++; CHECK-NEXT: st.b $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i8>, ptr %src ++ %e = extractelement <16 x i8> %v, i32 %idx ++ store i8 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_8xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_8xi16_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplve.h $vr0, $vr0, $a2 ++; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: srai.w $a0, $a0, 16 ++; CHECK-NEXT: st.h $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i16>, ptr %src ++ %e = extractelement <8 x i16> %v, i32 %idx ++ store i16 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_4xi32_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplve.w $vr0, $vr0, $a2 ++; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: st.w $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i32>, ptr %src ++ %e = extractelement <4 x i32> %v, i32 %idx ++ store i32 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_2xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_2xi64_idx: ++; 
CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplve.d $vr0, $vr0, $a2 ++; CHECK-NEXT: movfr2gr.d $a0, $fa0 ++; CHECK-NEXT: st.d $a0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x i64>, ptr %src ++ %e = extractelement <2 x i64> %v, i32 %idx ++ store i64 %e, ptr %dst ++ ret void ++} ++ ++define void @extract_4xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_4xfloat_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplve.w $vr0, $vr0, $a2 ++; CHECK-NEXT: fst.s $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x float>, ptr %src ++ %e = extractelement <4 x float> %v, i32 %idx ++ store float %e, ptr %dst ++ ret void ++} ++ ++define void @extract_2xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ++; CHECK-LABEL: extract_2xdouble_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vreplve.d $vr0, $vr0, $a2 ++; CHECK-NEXT: fst.d $fa0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x double>, ptr %src ++ %e = extractelement <2 x double> %v, i32 %idx ++ store double %e, ptr %dst ++ ret void ++} +-- +2.20.1 + + +From eb1dc17f9111c2bf2d20d366a9b46c4bda0606f6 Mon Sep 17 00:00:00 2001 +From: leecheechen +Date: Thu, 30 Nov 2023 21:41:18 +0800 +Subject: [PATCH 14/35] [LoongArch] Add some binary IR instructions testcases + for LSX (#73929) + +The IR instructions include: +- Binary Operations: add fadd sub fsub mul fmul udiv sdiv fdiv +- Bitwise Binary Operations: shl lshr ashr + +(cherry picked from commit 29a0f3ec2b47630ce229953fe7250e741b6c10b6) +--- + .../LoongArch/lsx/ir-instruction/add.ll | 122 +++++++++ + .../LoongArch/lsx/ir-instruction/ashr.ll | 178 +++++++++++++ + .../LoongArch/lsx/ir-instruction/fadd.ll | 34 +++ + .../LoongArch/lsx/ir-instruction/fdiv.ll | 34 +++ + .../LoongArch/lsx/ir-instruction/fmul.ll | 34 +++ + 
.../LoongArch/lsx/ir-instruction/fsub.ll | 34 +++ + .../LoongArch/lsx/ir-instruction/lshr.ll | 178 +++++++++++++ + .../LoongArch/lsx/ir-instruction/mul.ll | 242 ++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/sdiv.ll | 134 ++++++++++ + .../LoongArch/lsx/ir-instruction/shl.ll | 178 +++++++++++++ + .../LoongArch/lsx/ir-instruction/sub.ll | 122 +++++++++ + .../LoongArch/lsx/ir-instruction/udiv.ll | 122 +++++++++ + 12 files changed, 1412 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/add.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ashr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fmul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/lshr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sdiv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shl.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/udiv.ll + +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/add.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/add.ll +new file mode 100644 +index 000000000000..2a7c37c2ae34 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/add.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @add_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, 
$a1, 0 ++; CHECK-NEXT: vadd.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = add <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vadd.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = add <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vadd.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = add <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vadd.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = add <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v16i8_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v16i8_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vaddi.bu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = add <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @add_v8i16_31(ptr %res, ptr %a0) nounwind { 
++; CHECK-LABEL: add_v8i16_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vaddi.hu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = add <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @add_v4i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v4i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vaddi.wu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = add <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @add_v2i64_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v2i64_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vaddi.du $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = add <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ashr.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ashr.ll +new file mode 100644 +index 000000000000..fbc570d77ba8 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ashr.ll +@@ -0,0 +1,178 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @ashr_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsra.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = ashr <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; 
CHECK-LABEL: ashr_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsra.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = ashr <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsra.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = ashr <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsra.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = ashr <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v16i8_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v16i8_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.b $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = ashr <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v16i8_7(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v16i8_7: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.b $vr0, $vr0, 7 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = ashr <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define 
void @ashr_v8i16_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v8i16_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.h $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = ashr <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v8i16_15(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v8i16_15: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.h $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = ashr <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v4i32_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v4i32_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.w $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = ashr <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v4i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v4i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.w $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = ashr <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v2i64_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v2i64_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.d $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = ashr <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v2i64_63(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v2i64_63: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.d 
$vr0, $vr0, 63 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = ashr <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fadd.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fadd.ll +new file mode 100644 +index 000000000000..1fa1f611c4a3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fadd.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @fadd_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fadd_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfadd.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = fadd <4 x float> %v0, %v1 ++ store <4 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fadd_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fadd_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfadd.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = fadd <2 x double> %v0, %v1 ++ store <2 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll +new file mode 100644 +index 000000000000..eb7c8bd9616e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define 
void @fdiv_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fdiv_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfdiv.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = fdiv <4 x float> %v0, %v1 ++ store <4 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fdiv_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fdiv_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfdiv.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = fdiv <2 x double> %v0, %v1 ++ store <2 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fmul.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fmul.ll +new file mode 100644 +index 000000000000..e7fb527f7805 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fmul.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @fmul_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fmul_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = fmul <4 x float> %v0, %v1 ++ store <4 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fmul_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fmul_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: 
vfmul.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = fmul <2 x double> %v0, %v1 ++ store <2 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fsub.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fsub.ll +new file mode 100644 +index 000000000000..df98182321da +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fsub.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @fsub_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fsub_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfsub.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = fsub <4 x float> %v0, %v1 ++ store <4 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fsub_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fsub_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfsub.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = fsub <2 x double> %v0, %v1 ++ store <2 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/lshr.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/lshr.ll +new file mode 100644 +index 000000000000..dada52f93060 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/lshr.ll +@@ -0,0 +1,178 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @lshr_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsrl.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = lshr <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsrl.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = lshr <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsrl.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = lshr <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsrl.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = lshr <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v16i8_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v16i8_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: 
vsrli.b $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = lshr <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v16i8_7(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v16i8_7: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.b $vr0, $vr0, 7 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = lshr <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v8i16_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v8i16_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.h $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = lshr <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v8i16_15(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v8i16_15: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.h $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = lshr <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v4i32_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v4i32_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.w $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = lshr <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v4i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v4i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.w $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = lshr <4 x i32> %v0, ++ store <4 x i32> 
%v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v2i64_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v2i64_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.d $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = lshr <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v2i64_63(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v2i64_63: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.d $vr0, $vr0, 63 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = lshr <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll +new file mode 100644 +index 000000000000..5060240cd8b1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll +@@ -0,0 +1,242 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @mul_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = mul <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, 
ptr %a1 ++ %v2 = mul <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = mul <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = mul <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_square_v16i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vmul.b $vr0, $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = mul <16 x i8> %v0, %v0 ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_square_v8i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vmul.h $vr0, $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = mul <8 x i16> %v0, %v0 ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_square_v4i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vmul.w $vr0, $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = 
load <4 x i32>, ptr %a0 ++ %v1 = mul <4 x i32> %v0, %v0 ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_square_v2i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vmul.d $vr0, $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = mul <2 x i64> %v0, %v0 ++ store <2 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v16i8_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v16i8_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.b $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = mul <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v8i16_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v8i16_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.h $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = mul <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v4i32_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v4i32_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.w $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = mul <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v2i64_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v2i64_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.d $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = mul <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v16i8_17(ptr %res, ptr %a0) nounwind { ++; 
CHECK-LABEL: mul_v16i8_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a2, $zero, 17 ++; CHECK-NEXT: vreplgr2vr.b $vr0, $a2 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = mul <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v8i16_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v8i16_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a2, $zero, 17 ++; CHECK-NEXT: vreplgr2vr.h $vr0, $a2 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = mul <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v4i32_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v4i32_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a2, $zero, 17 ++; CHECK-NEXT: vreplgr2vr.w $vr0, $a2 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = mul <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v2i64_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v2i64_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: ori $a2, $zero, 17 ++; CHECK-NEXT: vreplgr2vr.d $vr0, $a2 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmul.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = mul <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sdiv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sdiv.ll +new file mode 100644 +index 000000000000..b68f73a74913 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sdiv.ll +@@ -0,0 +1,134 @@ ++; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @sdiv_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = sdiv <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = sdiv <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = sdiv <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = sdiv <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v16i8_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: 
sdiv_v16i8_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.b $vr1, $vr0, 7 ++; CHECK-NEXT: vsrli.b $vr1, $vr1, 5 ++; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 ++; CHECK-NEXT: vsrai.b $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = sdiv <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @sdiv_v8i16_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v8i16_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.h $vr1, $vr0, 15 ++; CHECK-NEXT: vsrli.h $vr1, $vr1, 13 ++; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 ++; CHECK-NEXT: vsrai.h $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = sdiv <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @sdiv_v4i32_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v4i32_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.w $vr1, $vr0, 31 ++; CHECK-NEXT: vsrli.w $vr1, $vr1, 29 ++; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 ++; CHECK-NEXT: vsrai.w $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = sdiv <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @sdiv_v2i64_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v2i64_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrai.d $vr1, $vr0, 63 ++; CHECK-NEXT: vsrli.d $vr1, $vr1, 61 ++; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 ++; CHECK-NEXT: vsrai.d $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = sdiv <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shl.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shl.ll +new file mode 100644 
+index 000000000000..fa0aebaf28b3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shl.ll +@@ -0,0 +1,178 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @shl_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsll.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = shl <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsll.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = shl <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsll.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = shl <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsll.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = shl <2 x i64> 
%v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v16i8_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v16i8_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.b $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = shl <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v16i8_7(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v16i8_7: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.b $vr0, $vr0, 7 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = shl <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v8i16_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v8i16_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.h $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = shl <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v8i16_15(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v8i16_15: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.h $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = shl <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v4i32_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v4i32_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.w $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = shl <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v4i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v4i32_31: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.w $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = shl <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v2i64_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v2i64_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.d $vr0, $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = shl <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v2i64_63(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v2i64_63: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslli.d $vr0, $vr0, 63 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = shl <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sub.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sub.ll +new file mode 100644 +index 000000000000..25b4623a47d1 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sub.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @sub_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsub.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = sub <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld 
$vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsub.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = sub <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsub.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = sub <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsub.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = sub <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v16i8_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v16i8_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsubi.bu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = sub <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @sub_v8i16_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v8i16_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsubi.hu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = sub <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @sub_v4i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v4i32_31: 
++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsubi.wu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = sub <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @sub_v2i64_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v2i64_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsubi.du $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = sub <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/udiv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/udiv.ll +new file mode 100644 +index 000000000000..abb60b91dd48 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/udiv.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @udiv_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.bu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = udiv <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @udiv_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.hu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = udiv <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void 
@udiv_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.wu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = udiv <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @udiv_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vdiv.du $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = udiv <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @udiv_v16i8_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v16i8_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.b $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = udiv <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @udiv_v8i16_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v8i16_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.h $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = udiv <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @udiv_v4i32_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v4i32_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.w $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = udiv <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @udiv_v2i64_8(ptr 
%res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v2i64_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vsrli.d $vr0, $vr0, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = udiv <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 30b414d9f2eb968e9f4cc6ffc76389a6f93b2907 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Thu, 16 Nov 2023 20:05:01 +0800 +Subject: [PATCH 15/35] [LoongArch] Add codegen support for insertelement + +(cherry picked from commit f2cbd1fdf702afe31d0198c9185e08dc2b104252) +--- + .../LoongArch/LoongArchISelLowering.cpp | 82 +++++- + .../Target/LoongArch/LoongArchISelLowering.h | 1 + + .../LoongArch/LoongArchLASXInstrInfo.td | 18 ++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 5 + + .../lasx/ir-instruction/insertelement.ll | 276 ++++++++++++++++++ + .../lsx/ir-instruction/insertelement.ll | 196 +++++++++++++ + 6 files changed, 576 insertions(+), 2 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 26e94a53b344..492339ce2151 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -237,7 +237,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + // FIXME: For BUILD_VECTOR, it is temporarily set to `Legal` here, and it + // will be `Custom` handled in the future. 
+ setOperationAction(ISD::BUILD_VECTOR, VT, Legal); +- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { +@@ -267,7 +267,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + // FIXME: Same as above. + setOperationAction(ISD::BUILD_VECTOR, VT, Legal); +- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { +@@ -369,10 +369,20 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + return lowerRETURNADDR(Op, DAG); + case ISD::WRITE_REGISTER: + return lowerWRITE_REGISTER(Op, DAG); ++ case ISD::INSERT_VECTOR_ELT: ++ return lowerINSERT_VECTOR_ELT(Op, DAG); + } + return SDValue(); + } + ++SDValue ++LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, ++ SelectionDAG &DAG) const { ++ if (isa(Op->getOperand(2))) ++ return Op; ++ return SDValue(); ++} ++ + SDValue LoongArchTargetLowering::lowerWRITE_REGISTER(SDValue Op, + SelectionDAG &DAG) const { + +@@ -3040,6 +3050,71 @@ emitVecCondBranchPseudo(MachineInstr &MI, MachineBasicBlock *BB, + return SinkBB; + } + ++static MachineBasicBlock * ++emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, ++ const LoongArchSubtarget &Subtarget) { ++ unsigned InsOp; ++ unsigned HalfSize; ++ switch (MI.getOpcode()) { ++ default: ++ llvm_unreachable("Unexpected opcode"); ++ case LoongArch::PseudoXVINSGR2VR_B: ++ HalfSize = 16; ++ InsOp = LoongArch::VINSGR2VR_B; ++ break; ++ case LoongArch::PseudoXVINSGR2VR_H: ++ HalfSize = 8; ++ InsOp = LoongArch::VINSGR2VR_H; ++ break; ++ } ++ const TargetInstrInfo *TII = Subtarget.getInstrInfo(); ++ const TargetRegisterClass *RC = &LoongArch::LASX256RegClass; ++ const 
TargetRegisterClass *SubRC = &LoongArch::LSX128RegClass; ++ DebugLoc DL = MI.getDebugLoc(); ++ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); ++ // XDst = vector_insert XSrc, Elt, Idx ++ Register XDst = MI.getOperand(0).getReg(); ++ Register XSrc = MI.getOperand(1).getReg(); ++ Register Elt = MI.getOperand(2).getReg(); ++ unsigned Idx = MI.getOperand(3).getImm(); ++ ++ Register ScratchReg1 = XSrc; ++ if (Idx >= HalfSize) { ++ ScratchReg1 = MRI.createVirtualRegister(RC); ++ BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) ++ .addReg(XSrc) ++ .addReg(XSrc) ++ .addImm(1); ++ } ++ ++ Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); ++ Register ScratchSubReg2 = MRI.createVirtualRegister(SubRC); ++ BuildMI(*BB, MI, DL, TII->get(LoongArch::COPY), ScratchSubReg1) ++ .addReg(ScratchReg1, 0, LoongArch::sub_128); ++ BuildMI(*BB, MI, DL, TII->get(InsOp), ScratchSubReg2) ++ .addReg(ScratchSubReg1) ++ .addReg(Elt) ++ .addImm(Idx >= HalfSize ? Idx - HalfSize : Idx); ++ ++ Register ScratchReg2 = XDst; ++ if (Idx >= HalfSize) ++ ScratchReg2 = MRI.createVirtualRegister(RC); ++ ++ BuildMI(*BB, MI, DL, TII->get(LoongArch::SUBREG_TO_REG), ScratchReg2) ++ .addImm(0) ++ .addReg(ScratchSubReg2) ++ .addImm(LoongArch::sub_128); ++ ++ if (Idx >= HalfSize) ++ BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), XDst) ++ .addReg(XSrc) ++ .addReg(ScratchReg2) ++ .addImm(2); ++ ++ MI.eraseFromParent(); ++ return BB; ++} ++ + MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); +@@ -3095,6 +3170,9 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( + case LoongArch::PseudoXVBNZ_W: + case LoongArch::PseudoXVBNZ_D: + return emitVecCondBranchPseudo(MI, BB, Subtarget); ++ case LoongArch::PseudoXVINSGR2VR_B: ++ case LoongArch::PseudoXVINSGR2VR_H: ++ return emitPseudoXVINSGR2VR(MI, BB, Subtarget); + } + } + +diff 
--git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 7765057ebffb..29028ff963d0 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -275,6 +275,7 @@ private: + SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 380206ddcf10..475565db15c9 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1065,6 +1065,13 @@ def PseudoXVBZ_W : VecCond; + def PseudoXVBZ_D : VecCond; + def PseudoXVBZ : VecCond; + ++let usesCustomInserter = 1, Constraints = "$xd = $dst" in { ++def PseudoXVINSGR2VR_B ++ : Pseudo<(outs LASX256:$dst), (ins LASX256:$xd, GPR:$rj, uimm5:$imm)>; ++def PseudoXVINSGR2VR_H ++ : Pseudo<(outs LASX256:$dst), (ins LASX256:$xd, GPR:$rj, uimm4:$imm)>; ++} // usesCustomInserter = 1, Constraints = "$xd = $dst" ++ + } // Predicates = [HasExtLASX] + + multiclass PatXr { +@@ -1365,12 +1372,23 @@ def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), + def : Pat<(fma v4f64:$xj, v4f64:$xk, v4f64:$xa), + (XVFMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; + ++// PseudoXVINSGR2VR_{B/H} ++def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), ++ (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; ++def : Pat<(vector_insert v16i16:$xd, GRLenVT:$rj, uimm4:$imm), ++ (PseudoXVINSGR2VR_H v16i16:$xd, GRLenVT:$rj, uimm4:$imm)>; ++ + // XVINSGR2VR_{W/D} + def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), + (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, 
uimm3:$imm)>; + def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), + (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; + ++def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm), ++ (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; ++def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm), ++ (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; ++ + // XVPICKVE2GR_W[U] + def : Pat<(loongarch_vpick_sext_elt v8i32:$xd, uimm3:$imm, i32), + (XVPICKVE2GR_W v8i32:$xd, uimm3:$imm)>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 980870e34503..d8fd132a1c59 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1462,6 +1462,11 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), + def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), + (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; + ++def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), ++ (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; ++def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), ++ (VINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm1:$imm)>; ++ + // VPICKVE2GR_{B/H/W}[U] + def : Pat<(loongarch_vpick_sext_elt v16i8:$vd, uimm4:$imm, i8), + (VPICKVE2GR_B v16i8:$vd, uimm4:$imm)>; +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +new file mode 100644 +index 000000000000..e571a5d2e4cf +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +@@ -0,0 +1,276 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @insert_32xi8(ptr %src, ptr %dst, i8 %in) nounwind { ++; CHECK-LABEL: insert_32xi8: ++; 
CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <32 x i8>, ptr %src ++ %v_new = insertelement <32 x i8> %v, i8 %in, i32 1 ++ store <32 x i8> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_32xi8_upper(ptr %src, ptr %dst, i8 %in) nounwind { ++; CHECK-LABEL: insert_32xi8_upper: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 0 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <32 x i8>, ptr %src ++ %v_new = insertelement <32 x i8> %v, i8 %in, i32 16 ++ store <32 x i8> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_16xi16(ptr %src, ptr %dst, i16 %in) nounwind { ++; CHECK-LABEL: insert_16xi16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i16>, ptr %src ++ %v_new = insertelement <16 x i16> %v, i16 %in, i32 1 ++ store <16 x i16> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_16xi16_upper(ptr %src, ptr %dst, i16 %in) nounwind { ++; CHECK-LABEL: insert_16xi16_upper: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 0 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i16>, ptr %src ++ %v_new = insertelement <16 x i16> %v, i16 %in, i32 8 ++ store <16 x i16> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_8xi32(ptr %src, ptr %dst, i32 %in) nounwind { ++; CHECK-LABEL: insert_8xi32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a2, 1 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; 
CHECK-NEXT: ret ++ %v = load volatile <8 x i32>, ptr %src ++ %v_new = insertelement <8 x i32> %v, i32 %in, i32 1 ++ store <8 x i32> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xi64(ptr %src, ptr %dst, i64 %in) nounwind { ++; CHECK-LABEL: insert_4xi64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 1 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i64>, ptr %src ++ %v_new = insertelement <4 x i64> %v, i64 %in, i32 1 ++ store <4 x i64> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_8xfloat(ptr %src, ptr %dst, float %in) nounwind { ++; CHECK-LABEL: insert_8xfloat: ++; CHECK: # %bb.0: ++; CHECK-NEXT: movfr2gr.s $a2, $fa0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a2, 1 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x float>, ptr %src ++ %v_new = insertelement <8 x float> %v, float %in, i32 1 ++ store <8 x float> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind { ++; CHECK-LABEL: insert_4xdouble: ++; CHECK: # %bb.0: ++; CHECK-NEXT: movfr2gr.d $a2, $fa0 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 1 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x double>, ptr %src ++ %v_new = insertelement <4 x double> %v, double %in, i32 1 ++ store <4 x double> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { ++; CHECK-LABEL: insert_32xi8_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a4, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a4, 5 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d 
$a0, $a3, 4, 0 ++; CHECK-NEXT: st.b $a2, $a0, 0 ++; CHECK-NEXT: xvld $xr0, $sp, 0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 ++; CHECK-NEXT: ret ++ %v = load volatile <32 x i8>, ptr %src ++ %v_new = insertelement <32 x i8> %v, i8 %in, i32 %idx ++ store <32 x i8> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { ++; CHECK-LABEL: insert_16xi16_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a4, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a4, 5 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1 ++; CHECK-NEXT: st.h $a2, $a0, 0 ++; CHECK-NEXT: xvld $xr0, $sp, 0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i16>, ptr %src ++ %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx ++ store <16 x i16> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { ++; CHECK-LABEL: insert_8xi32_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a4, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a4, 5 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: 
bstrins.d $a0, $a3, 4, 2 ++; CHECK-NEXT: st.w $a2, $a0, 0 ++; CHECK-NEXT: xvld $xr0, $sp, 0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i32>, ptr %src ++ %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx ++ store <8 x i32> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { ++; CHECK-LABEL: insert_4xi64_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a4, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a4, 5 ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3 ++; CHECK-NEXT: st.d $a2, $a0, 0 ++; CHECK-NEXT: xvld $xr0, $sp, 0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i64>, ptr %src ++ %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx ++ store <4 x i64> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind { ++; CHECK-LABEL: insert_8xfloat_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 ++; CHECK-NEXT: xvld $xr1, $a0, 0 ++; CHECK-NEXT: xvst $xr1, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; 
CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ++; CHECK-NEXT: fst.s $fa0, $a0, 0 ++; CHECK-NEXT: xvld $xr0, $sp, 0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x float>, ptr %src ++ %v_new = insertelement <8 x float> %v, float %in, i32 %idx ++ store <8 x float> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounwind { ++; CHECK-LABEL: insert_4xdouble_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 ++; CHECK-NEXT: xvld $xr1, $a0, 0 ++; CHECK-NEXT: xvst $xr1, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ++; CHECK-NEXT: fst.d $fa0, $a0, 0 ++; CHECK-NEXT: xvld $xr0, $sp, 0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x double>, ptr %src ++ %v_new = insertelement <4 x double> %v, double %in, i32 %idx ++ store <4 x double> %v_new, ptr %dst ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll +new file mode 100644 +index 000000000000..a9834591aa0e +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll +@@ -0,0 +1,196 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | 
FileCheck %s ++ ++define void @insert_16xi8(ptr %src, ptr %dst, i8 %ins) nounwind { ++; CHECK-LABEL: insert_16xi8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i8>, ptr %src ++ %v_new = insertelement <16 x i8> %v, i8 %ins, i32 1 ++ store <16 x i8> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_8xi16(ptr %src, ptr %dst, i16 %ins) nounwind { ++; CHECK-LABEL: insert_8xi16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i16>, ptr %src ++ %v_new = insertelement <8 x i16> %v, i16 %ins, i32 1 ++ store <8 x i16> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xi32(ptr %src, ptr %dst, i32 %ins) nounwind { ++; CHECK-LABEL: insert_4xi32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i32>, ptr %src ++ %v_new = insertelement <4 x i32> %v, i32 %ins, i32 1 ++ store <4 x i32> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_2xi64(ptr %src, ptr %dst, i64 %ins) nounwind { ++; CHECK-LABEL: insert_2xi64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x i64>, ptr %src ++ %v_new = insertelement <2 x i64> %v, i64 %ins, i32 1 ++ store <2 x i64> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xfloat(ptr %src, ptr %dst, float %ins) nounwind { ++; CHECK-LABEL: insert_4xfloat: ++; CHECK: # %bb.0: ++; CHECK-NEXT: movfr2gr.s $a2, $fa0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x float>, ptr %src ++ %v_new = insertelement <4 x float> %v, float 
%ins, i32 1 ++ store <4 x float> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_2xdouble(ptr %src, ptr %dst, double %ins) nounwind { ++; CHECK-LABEL: insert_2xdouble: ++; CHECK: # %bb.0: ++; CHECK-NEXT: movfr2gr.d $a2, $fa0 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x double>, ptr %src ++ %v_new = insertelement <2 x double> %v, double %ins, i32 1 ++ store <2 x double> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind { ++; CHECK-LABEL: insert_16xi8_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vst $vr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a3, 3, 0 ++; CHECK-NEXT: st.b $a2, $a0, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <16 x i8>, ptr %src ++ %v_new = insertelement <16 x i8> %v, i8 %ins, i32 %idx ++ store <16 x i8> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind { ++; CHECK-LABEL: insert_8xi16_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vst $vr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a3, 3, 1 ++; CHECK-NEXT: st.h $a2, $a0, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <8 x i16>, ptr %src ++ %v_new = insertelement <8 x i16> %v, i16 %ins, i32 %idx ++ store <8 x i16> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind { ++; CHECK-LABEL: insert_4xi32_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; 
CHECK-NEXT: vst $vr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a3, 3, 2 ++; CHECK-NEXT: st.w $a2, $a0, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i32>, ptr %src ++ %v_new = insertelement <4 x i32> %v, i32 %ins, i32 %idx ++ store <4 x i32> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind { ++; CHECK-LABEL: insert_2xi64_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vst $vr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a3, 3, 3 ++; CHECK-NEXT: st.d $a2, $a0, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x i64>, ptr %src ++ %v_new = insertelement <2 x i64> %v, i64 %ins, i32 %idx ++ store <2 x i64> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwind { ++; CHECK-LABEL: insert_4xfloat_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr1, $a0, 0 ++; CHECK-NEXT: vst $vr1, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 3, 2 ++; CHECK-NEXT: fst.s $fa0, $a0, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x float>, ptr %src ++ %v_new = insertelement <4 x float> %v, float %ins, i32 %idx ++ store <4 x float> %v_new, ptr %dst ++ ret void ++} ++ ++define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) nounwind { ++; CHECK-LABEL: insert_2xdouble_idx: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr1, $a0, 0 ++; CHECK-NEXT: vst $vr1, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d 
$a0, $a2, 3, 3 ++; CHECK-NEXT: fst.d $fa0, $a0, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <2 x double>, ptr %src ++ %v_new = insertelement <2 x double> %v, double %ins, i32 %idx ++ store <2 x double> %v_new, ptr %dst ++ ret void ++} +-- +2.20.1 + + +From 8c15dfb5e6c74537f5748936702e4d077d09815d Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Wed, 25 Oct 2023 17:00:32 +0800 +Subject: [PATCH 16/35] [LoongArch] Custom lowering `ISD::BUILD_VECTOR` + +(cherry picked from commit add224c0a094d20389d3659f7b6e496df461a976) +--- + .../LoongArch/LoongArchISelDAGToDAG.cpp | 52 +- + .../LoongArch/LoongArchISelLowering.cpp | 102 +++- + .../Target/LoongArch/LoongArchISelLowering.h | 1 + + .../LoongArch/LoongArchLASXInstrInfo.td | 13 + + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 12 +- + .../CodeGen/LoongArch/lasx/build-vector.ll | 551 ++++++++++++++++++ + .../CodeGen/LoongArch/lsx/build-vector.ll | 376 ++++++++++++ + .../LoongArch/lsx/ir-instruction/mul.ll | 28 +- + 8 files changed, 1112 insertions(+), 23 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/build-vector.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/build-vector.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +index f55184019988..01b2f720f902 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +@@ -77,13 +77,63 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { + return; + } + case ISD::BITCAST: { +- if (VT.is128BitVector() || VT.is512BitVector()) { ++ if (VT.is128BitVector() || VT.is256BitVector()) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + return; + } + break; + } ++ case ISD::BUILD_VECTOR: { ++ // Select appropriate [x]vrepli.[bhwd] instructions for constant splats of ++ // 128/256-bit 
when LSX/LASX is enabled. ++ BuildVectorSDNode *BVN = cast(Node); ++ APInt SplatValue, SplatUndef; ++ unsigned SplatBitSize; ++ bool HasAnyUndefs; ++ unsigned Op; ++ EVT ViaVecTy; ++ bool Is128Vec = BVN->getValueType(0).is128BitVector(); ++ bool Is256Vec = BVN->getValueType(0).is256BitVector(); ++ ++ if (!Subtarget->hasExtLSX() || (!Is128Vec && !Is256Vec)) ++ break; ++ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, ++ HasAnyUndefs, 8)) ++ break; ++ ++ switch (SplatBitSize) { ++ default: ++ break; ++ case 8: ++ Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B; ++ ViaVecTy = Is256Vec ? MVT::v32i8 : MVT::v16i8; ++ break; ++ case 16: ++ Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H; ++ ViaVecTy = Is256Vec ? MVT::v16i16 : MVT::v8i16; ++ break; ++ case 32: ++ Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W; ++ ViaVecTy = Is256Vec ? MVT::v8i32 : MVT::v4i32; ++ break; ++ case 64: ++ Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D; ++ ViaVecTy = Is256Vec ? MVT::v4i64 : MVT::v2i64; ++ break; ++ } ++ ++ SDNode *Res; ++ // If we have a signed 10 bit integer, we can splat it directly. ++ if (SplatValue.isSignedIntN(10)) { ++ SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL, ++ ViaVecTy.getVectorElementType()); ++ Res = CurDAG->getMachineNode(Op, DL, ViaVecTy, Imm); ++ ReplaceNode(Node, Res); ++ return; ++ } ++ break; ++ } + } + + // Select the default instruction. 
+diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 492339ce2151..1b60bfc3bddb 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -234,11 +234,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::BITCAST, VT, Legal); + setOperationAction(ISD::UNDEF, VT, Legal); + +- // FIXME: For BUILD_VECTOR, it is temporarily set to `Legal` here, and it +- // will be `Custom` handled in the future. +- setOperationAction(ISD::BUILD_VECTOR, VT, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); +@@ -265,10 +263,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::BITCAST, VT, Legal); + setOperationAction(ISD::UNDEF, VT, Legal); + +- // FIXME: Same as above. 
+- setOperationAction(ISD::BUILD_VECTOR, VT, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); +@@ -371,10 +368,105 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + return lowerWRITE_REGISTER(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return lowerINSERT_VECTOR_ELT(Op, DAG); ++ case ISD::BUILD_VECTOR: ++ return lowerBUILD_VECTOR(Op, DAG); + } + return SDValue(); + } + ++static bool isConstantOrUndef(const SDValue Op) { ++ if (Op->isUndef()) ++ return true; ++ if (isa(Op)) ++ return true; ++ if (isa(Op)) ++ return true; ++ return false; ++} ++ ++static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) { ++ for (unsigned i = 0; i < Op->getNumOperands(); ++i) ++ if (isConstantOrUndef(Op->getOperand(i))) ++ return true; ++ return false; ++} ++ ++SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, ++ SelectionDAG &DAG) const { ++ BuildVectorSDNode *Node = cast(Op); ++ EVT ResTy = Op->getValueType(0); ++ SDLoc DL(Op); ++ APInt SplatValue, SplatUndef; ++ unsigned SplatBitSize; ++ bool HasAnyUndefs; ++ bool Is128Vec = ResTy.is128BitVector(); ++ bool Is256Vec = ResTy.is256BitVector(); ++ ++ if ((!Subtarget.hasExtLSX() || !Is128Vec) && ++ (!Subtarget.hasExtLASX() || !Is256Vec)) ++ return SDValue(); ++ ++ if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ++ /*MinSplatBits=*/8) && ++ SplatBitSize <= 64) { ++ // We can only cope with 8, 16, 32, or 64-bit elements. ++ if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 && ++ SplatBitSize != 64) ++ return SDValue(); ++ ++ EVT ViaVecTy; ++ ++ switch (SplatBitSize) { ++ default: ++ return SDValue(); ++ case 8: ++ ViaVecTy = Is128Vec ? 
MVT::v16i8 : MVT::v32i8; ++ break; ++ case 16: ++ ViaVecTy = Is128Vec ? MVT::v8i16 : MVT::v16i16; ++ break; ++ case 32: ++ ViaVecTy = Is128Vec ? MVT::v4i32 : MVT::v8i32; ++ break; ++ case 64: ++ ViaVecTy = Is128Vec ? MVT::v2i64 : MVT::v4i64; ++ break; ++ } ++ ++ // SelectionDAG::getConstant will promote SplatValue appropriately. ++ SDValue Result = DAG.getConstant(SplatValue, DL, ViaVecTy); ++ ++ // Bitcast to the type we originally wanted. ++ if (ViaVecTy != ResTy) ++ Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result); ++ ++ return Result; ++ } ++ ++ if (DAG.isSplatValue(Op, /*AllowUndefs=*/false)) ++ return Op; ++ ++ if (!isConstantOrUndefBUILD_VECTOR(Node)) { ++ // Use INSERT_VECTOR_ELT operations rather than expand to stores. ++ // The resulting code is the same length as the expansion, but it doesn't ++ // use memory operations. ++ EVT ResTy = Node->getValueType(0); ++ ++ assert(ResTy.isVector()); ++ ++ unsigned NumElts = ResTy.getVectorNumElements(); ++ SDValue Vector = DAG.getUNDEF(ResTy); ++ for (unsigned i = 0; i < NumElts; ++i) { ++ Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, ++ Node->getOperand(i), ++ DAG.getConstant(i, DL, Subtarget.getGRLenVT())); ++ } ++ return Vector; ++ } ++ ++ return SDValue(); ++} ++ + SDValue + LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 29028ff963d0..111376306374 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -276,6 +276,7 @@ private: + SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool 
ForCodeSize) const override; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 475565db15c9..4487152fb42b 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -33,6 +33,13 @@ def lasxsplati32 + def lasxsplati64 + : PatFrag<(ops node:$e0), + (v4i64 (build_vector node:$e0, node:$e0, node:$e0, node:$e0))>; ++def lasxsplatf32 ++ : PatFrag<(ops node:$e0), ++ (v8f32 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, ++ node:$e0, node:$e0, node:$e0, node:$e0))>; ++def lasxsplatf64 ++ : PatFrag<(ops node:$e0), ++ (v4f64 (build_vector node:$e0, node:$e0, node:$e0, node:$e0))>; + + //===----------------------------------------------------------------------===// + // Instruction class templates +@@ -1411,6 +1418,12 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk), + (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>; + ++// XVREPL128VEI_{W/D} ++def : Pat<(lasxsplatf32 FPR32:$fj), ++ (XVREPL128VEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>; ++def : Pat<(lasxsplatf64 FPR64:$fj), ++ (XVREPL128VEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)>; ++ + // Loads/Stores + foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { + defm : LdPat; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index d8fd132a1c59..deac5015882d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -141,9 +141,13 @@ def lsxsplati16 : PatFrag<(ops node:$e0), + def lsxsplati32 : PatFrag<(ops node:$e0), + (v4i32 (build_vector node:$e0, node:$e0, + node:$e0, node:$e0))>; +- + def lsxsplati64 : PatFrag<(ops node:$e0), + (v2i64 (build_vector node:$e0, node:$e0))>; ++def lsxsplatf32 : PatFrag<(ops node:$e0), ++ (v4f32 (build_vector node:$e0, node:$e0, ++ 
node:$e0, node:$e0))>; ++def lsxsplatf64 : PatFrag<(ops node:$e0), ++ (v2f64 (build_vector node:$e0, node:$e0))>; + + def to_valid_timm : SDNodeXForm(N); +@@ -1498,6 +1502,12 @@ def : Pat<(loongarch_vreplve v4i32:$vj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v2i64:$vj, GRLenVT:$rk), + (VREPLVE_D v2i64:$vj, GRLenVT:$rk)>; + ++// VREPLVEI_{W/D} ++def : Pat<(lsxsplatf32 FPR32:$fj), ++ (VREPLVEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>; ++def : Pat<(lsxsplatf64 FPR64:$fj), ++ (VREPLVEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)>; ++ + // Loads/Stores + foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { + defm : LdPat; +diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +new file mode 100644 +index 000000000000..6824ab5cda8d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +@@ -0,0 +1,551 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @buildvector_v32i8_splat(ptr %dst, i8 %a0) nounwind { ++; CHECK-LABEL: buildvector_v32i8_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.b $xr0, $a1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <32 x i8> undef, i8 %a0, i8 0 ++ %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ++ store <32 x i8> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i16_splat(ptr %dst, i16 %a0) nounwind { ++; CHECK-LABEL: buildvector_v16i16_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.h $xr0, $a1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <16 x i16> undef, i16 %a0, i8 0 ++ %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer ++ store <16 x i16> %splat, ptr %dst ++ ret void ++} ++ ++define void 
@buildvector_v8i32_splat(ptr %dst, i32 %a0) nounwind { ++; CHECK-LABEL: buildvector_v8i32_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.w $xr0, $a1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <8 x i32> undef, i32 %a0, i8 0 ++ %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer ++ store <8 x i32> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i64_splat(ptr %dst, i64 %a0) nounwind { ++; CHECK-LABEL: buildvector_v4i64_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvreplgr2vr.d $xr0, $a1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <4 x i64> undef, i64 %a0, i8 0 ++ %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer ++ store <4 x i64> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8f32_splat(ptr %dst, float %a0) nounwind { ++; CHECK-LABEL: buildvector_v8f32_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 ++; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <8 x float> undef, float %a0, i8 0 ++ %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer ++ store <8 x float> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4f64_splat(ptr %dst, double %a0) nounwind { ++; CHECK-LABEL: buildvector_v4f64_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 ++; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <4 x double> undef, double %a0, i8 0 ++ %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer ++ store <4 x double> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v32i8_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: 
buildvector_v32i8_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.b $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <32 x i8> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i16_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v16i16_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.h $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <16 x i16> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i32_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v8i32_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.w $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <8 x i32> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i64_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v4i64_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvrepli.d $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <4 x i64> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f32_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2f32_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: lu12i.w $a1, 260096 ++; CHECK-NEXT: xvreplgr2vr.w $xr0, $a1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <8 x float> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4f64_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v4f64_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: lu52i.d $a1, $zero, 1023 ++; CHECK-NEXT: xvreplgr2vr.d $xr0, $a1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <4 x double> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v32i8_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v32i8_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI12_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI12_0) ++; CHECK-NEXT: 
xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <32 x i8> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i16_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v16i16_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI13_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI13_0) ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <16 x i16> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i32_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v8i32_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI14_0) ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <8 x i32> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i64_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v4i64_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI15_0) ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <4 x i64> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f32_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2f32_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI16_0) ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <8 x float> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4f64_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v4f64_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store 
<4 x double> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v32i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind { ++; CHECK-LABEL: buildvector_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 2 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 3 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 4 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 5 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 6 ++; CHECK-NEXT: ld.b $a1, $sp, 0 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 7 ++; CHECK-NEXT: ld.b $a1, $sp, 8 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 8 ++; CHECK-NEXT: ld.b $a1, $sp, 16 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 9 ++; CHECK-NEXT: ld.b $a1, $sp, 24 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 10 ++; CHECK-NEXT: ld.b $a1, $sp, 32 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 11 ++; CHECK-NEXT: ld.b $a1, $sp, 40 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 12 ++; CHECK-NEXT: ld.b $a1, $sp, 48 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 13 ++; CHECK-NEXT: ld.b $a1, $sp, 56 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 14 ++; CHECK-NEXT: ld.b $a1, $sp, 64 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 15 ++; CHECK-NEXT: ld.b $a1, $sp, 72 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 80 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 88 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; 
CHECK-NEXT: ld.b $a1, $sp, 96 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 104 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 112 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 120 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 128 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 136 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 144 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 152 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 160 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 168 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 176 ++; CHECK-NEXT: xvori.b 
$xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 184 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.b $a1, $sp, 192 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <32 x i8> undef, i8 %a0, i32 0 ++ %ins1 = insertelement <32 x i8> %ins0, i8 %a1, i32 1 ++ %ins2 = insertelement <32 x i8> %ins1, i8 %a2, i32 2 ++ %ins3 = insertelement <32 x i8> %ins2, i8 %a3, i32 3 ++ %ins4 = insertelement <32 x i8> %ins3, i8 %a4, i32 4 ++ %ins5 = insertelement <32 x i8> %ins4, i8 %a5, i32 5 ++ %ins6 = insertelement <32 x i8> %ins5, i8 %a6, i32 6 ++ %ins7 = insertelement <32 x i8> %ins6, i8 %a7, i32 7 ++ %ins8 = insertelement <32 x i8> %ins7, i8 %a8, i32 8 ++ %ins9 = insertelement <32 x i8> %ins8, i8 %a9, i32 9 ++ %ins10 = insertelement <32 x i8> %ins9, i8 %a10, i32 10 ++ %ins11 = insertelement <32 x i8> %ins10, i8 %a11, i32 11 ++ %ins12 = insertelement <32 x i8> %ins11, i8 %a12, i32 12 ++ %ins13 = insertelement <32 x i8> %ins12, i8 %a13, i32 13 ++ %ins14 = insertelement <32 x i8> %ins13, i8 %a14, i32 14 ++ %ins15 = insertelement <32 x i8> %ins14, i8 %a15, i32 15 ++ %ins16 = insertelement <32 x i8> %ins15, i8 %a16, i32 16 ++ %ins17 = insertelement <32 x i8> %ins16, i8 %a17, i32 17 ++ %ins18 = insertelement <32 x i8> %ins17, i8 %a18, i32 18 ++ %ins19 = insertelement <32 x i8> %ins18, i8 %a19, i32 19 ++ %ins20 = insertelement <32 x i8> %ins19, i8 %a20, i32 20 ++ %ins21 = insertelement <32 x i8> %ins20, i8 %a21, i32 21 ++ %ins22 = insertelement <32 x i8> %ins21, i8 %a22, i32 22 ++ %ins23 = insertelement <32 x i8> %ins22, i8 %a23, i32 23 ++ %ins24 = 
insertelement <32 x i8> %ins23, i8 %a24, i32 24 ++ %ins25 = insertelement <32 x i8> %ins24, i8 %a25, i32 25 ++ %ins26 = insertelement <32 x i8> %ins25, i8 %a26, i32 26 ++ %ins27 = insertelement <32 x i8> %ins26, i8 %a27, i32 27 ++ %ins28 = insertelement <32 x i8> %ins27, i8 %a28, i32 28 ++ %ins29 = insertelement <32 x i8> %ins28, i8 %a29, i32 29 ++ %ins30 = insertelement <32 x i8> %ins29, i8 %a30, i32 30 ++ %ins31 = insertelement <32 x i8> %ins30, i8 %a31, i32 31 ++ store <32 x i8> %ins31, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { ++; CHECK-LABEL: buildvector_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 2 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 3 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 4 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a6, 5 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a7, 6 ++; CHECK-NEXT: ld.h $a1, $sp, 0 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 ++; CHECK-NEXT: ld.h $a1, $sp, 8 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 16 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 24 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 32 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 40 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 
++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 48 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 56 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: ld.h $a1, $sp, 64 ++; CHECK-NEXT: xvori.b $xr1, $xr0, 0 ++; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 ++; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <16 x i16> undef, i16 %a0, i32 0 ++ %ins1 = insertelement <16 x i16> %ins0, i16 %a1, i32 1 ++ %ins2 = insertelement <16 x i16> %ins1, i16 %a2, i32 2 ++ %ins3 = insertelement <16 x i16> %ins2, i16 %a3, i32 3 ++ %ins4 = insertelement <16 x i16> %ins3, i16 %a4, i32 4 ++ %ins5 = insertelement <16 x i16> %ins4, i16 %a5, i32 5 ++ %ins6 = insertelement <16 x i16> %ins5, i16 %a6, i32 6 ++ %ins7 = insertelement <16 x i16> %ins6, i16 %a7, i32 7 ++ %ins8 = insertelement <16 x i16> %ins7, i16 %a8, i32 8 ++ %ins9 = insertelement <16 x i16> %ins8, i16 %a9, i32 9 ++ %ins10 = insertelement <16 x i16> %ins9, i16 %a10, i32 10 ++ %ins11 = insertelement <16 x i16> %ins10, i16 %a11, i32 11 ++ %ins12 = insertelement <16 x i16> %ins11, i16 %a12, i32 12 ++ %ins13 = insertelement <16 x i16> %ins12, i16 %a13, i32 13 ++ %ins14 = insertelement <16 x i16> %ins13, i16 %a14, i32 14 ++ %ins15 = insertelement <16 x i16> %ins14, i16 %a15, i32 15 ++ store <16 x i16> %ins15, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind { ++; CHECK-LABEL: buildvector_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvinsgr2vr.w $xr0, 
$a1, 0 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a2, 1 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a3, 2 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a4, 3 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a5, 4 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a6, 5 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a7, 6 ++; CHECK-NEXT: ld.w $a1, $sp, 0 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 7 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <8 x i32> undef, i32 %a0, i32 0 ++ %ins1 = insertelement <8 x i32> %ins0, i32 %a1, i32 1 ++ %ins2 = insertelement <8 x i32> %ins1, i32 %a2, i32 2 ++ %ins3 = insertelement <8 x i32> %ins2, i32 %a3, i32 3 ++ %ins4 = insertelement <8 x i32> %ins3, i32 %a4, i32 4 ++ %ins5 = insertelement <8 x i32> %ins4, i32 %a5, i32 5 ++ %ins6 = insertelement <8 x i32> %ins5, i32 %a6, i32 6 ++ %ins7 = insertelement <8 x i32> %ins6, i32 %a7, i32 7 ++ store <8 x i32> %ins7, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i64(ptr %dst, i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind { ++; CHECK-LABEL: buildvector_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 0 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 1 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a3, 2 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a4, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <4 x i64> undef, i64 %a0, i32 0 ++ %ins1 = insertelement <4 x i64> %ins0, i64 %a1, i32 1 ++ %ins2 = insertelement <4 x i64> %ins1, i64 %a2, i32 2 ++ %ins3 = insertelement <4 x i64> %ins2, i64 %a3, i32 3 ++ store <4 x i64> %ins3, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8f32(ptr %dst, float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind { ++; CHECK-LABEL: buildvector_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: movfr2gr.s $a1, $fa0 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 0 ++; CHECK-NEXT: movfr2gr.s $a1, $fa1 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 1 ++; CHECK-NEXT: movfr2gr.s $a1, $fa2 ++; CHECK-NEXT: 
xvinsgr2vr.w $xr0, $a1, 2 ++; CHECK-NEXT: movfr2gr.s $a1, $fa3 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 3 ++; CHECK-NEXT: movfr2gr.s $a1, $fa4 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 4 ++; CHECK-NEXT: movfr2gr.s $a1, $fa5 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5 ++; CHECK-NEXT: movfr2gr.s $a1, $fa6 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 6 ++; CHECK-NEXT: movfr2gr.s $a1, $fa7 ++; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 7 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <8 x float> undef, float %a0, i32 0 ++ %ins1 = insertelement <8 x float> %ins0, float %a1, i32 1 ++ %ins2 = insertelement <8 x float> %ins1, float %a2, i32 2 ++ %ins3 = insertelement <8 x float> %ins2, float %a3, i32 3 ++ %ins4 = insertelement <8 x float> %ins3, float %a4, i32 4 ++ %ins5 = insertelement <8 x float> %ins4, float %a5, i32 5 ++ %ins6 = insertelement <8 x float> %ins5, float %a6, i32 6 ++ %ins7 = insertelement <8 x float> %ins6, float %a7, i32 7 ++ store <8 x float> %ins7, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4f64(ptr %dst, double %a0, double %a1, double %a2, double %a3) nounwind { ++; CHECK-LABEL: buildvector_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: movfr2gr.d $a1, $fa0 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 0 ++; CHECK-NEXT: movfr2gr.d $a1, $fa1 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1 ++; CHECK-NEXT: movfr2gr.d $a1, $fa2 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2 ++; CHECK-NEXT: movfr2gr.d $a1, $fa3 ++; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <4 x double> undef, double %a0, i32 0 ++ %ins1 = insertelement <4 x double> %ins0, double %a1, i32 1 ++ %ins2 = insertelement <4 x double> %ins1, double %a2, i32 2 ++ %ins3 = insertelement <4 x double> %ins2, double %a3, i32 3 ++ store <4 x double> %ins3, ptr %dst ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +new 
file mode 100644 +index 000000000000..3a74db5e1acb +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +@@ -0,0 +1,376 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @buildvector_v16i8_splat(ptr %dst, i8 %a0) nounwind { ++; CHECK-LABEL: buildvector_v16i8_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.b $vr0, $a1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <16 x i8> undef, i8 %a0, i8 0 ++ %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ++ store <16 x i8> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i16_splat(ptr %dst, i16 %a0) nounwind { ++; CHECK-LABEL: buildvector_v8i16_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.h $vr0, $a1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <8 x i16> undef, i16 %a0, i8 0 ++ %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer ++ store <8 x i16> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i32_splat(ptr %dst, i32 %a0) nounwind { ++; CHECK-LABEL: buildvector_v4i32_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.w $vr0, $a1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <4 x i32> undef, i32 %a0, i8 0 ++ %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer ++ store <4 x i32> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2i64_splat(ptr %dst, i64 %a0) nounwind { ++; CHECK-LABEL: buildvector_v2i64_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vreplgr2vr.d $vr0, $a1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <2 x i64> undef, i64 %a0, i8 0 ++ %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, 
<2 x i32> zeroinitializer ++ store <2 x i64> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4f32_splat(ptr %dst, float %a0) nounwind { ++; CHECK-LABEL: buildvector_v4f32_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 ++; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <4 x float> undef, float %a0, i8 0 ++ %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer ++ store <4 x float> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f64_splat(ptr %dst, double %a0) nounwind { ++; CHECK-LABEL: buildvector_v2f64_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 ++; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %insert = insertelement <2 x double> undef, double %a0, i8 0 ++ %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer ++ store <2 x double> %splat, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i8_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v16i8_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.b $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <16 x i8> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i16_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v8i16_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.h $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <8 x i16> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i32_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v4i32_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.w $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <4 x i32> , ptr %dst ++ ret void ++} ++ ++define void 
@buildvector_v2i64_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2i64_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vrepli.d $vr0, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <2 x i64> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f32_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2f32_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: lu12i.w $a1, 260096 ++; CHECK-NEXT: vreplgr2vr.w $vr0, $a1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <4 x float> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f64_const_splat(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2f64_const_splat: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: lu52i.d $a1, $zero, 1023 ++; CHECK-NEXT: vreplgr2vr.d $vr0, $a1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <2 x double> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i8_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v16i8_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI12_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI12_0) ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <16 x i8> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i16_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v8i16_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI13_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI13_0) ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <8 x i16> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i32_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v4i32_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI14_0) ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 
++; CHECK-NEXT: ret ++entry: ++ store <4 x i32> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2i64_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2i64_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI15_0) ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <2 x i64> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f32_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2f32_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI16_0) ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <4 x float> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f64_const(ptr %dst) nounwind { ++; CHECK-LABEL: buildvector_v2f64_const: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) ++; CHECK-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ store <2 x double> , ptr %dst ++ ret void ++} ++ ++define void @buildvector_v16i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind { ++; CHECK-LABEL: buildvector_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 0 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 1 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 2 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a4, 3 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a5, 4 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a6, 5 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a7, 6 ++; CHECK-NEXT: ld.b $a1, $sp, 0 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 7 ++; CHECK-NEXT: ld.b $a1, $sp, 8 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 8 ++; CHECK-NEXT: ld.b $a1, $sp, 16 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 9 ++; CHECK-NEXT: 
ld.b $a1, $sp, 24 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 10 ++; CHECK-NEXT: ld.b $a1, $sp, 32 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 11 ++; CHECK-NEXT: ld.b $a1, $sp, 40 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 12 ++; CHECK-NEXT: ld.b $a1, $sp, 48 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 13 ++; CHECK-NEXT: ld.b $a1, $sp, 56 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 14 ++; CHECK-NEXT: ld.b $a1, $sp, 64 ++; CHECK-NEXT: vinsgr2vr.b $vr0, $a1, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <16 x i8> undef, i8 %a0, i32 0 ++ %ins1 = insertelement <16 x i8> %ins0, i8 %a1, i32 1 ++ %ins2 = insertelement <16 x i8> %ins1, i8 %a2, i32 2 ++ %ins3 = insertelement <16 x i8> %ins2, i8 %a3, i32 3 ++ %ins4 = insertelement <16 x i8> %ins3, i8 %a4, i32 4 ++ %ins5 = insertelement <16 x i8> %ins4, i8 %a5, i32 5 ++ %ins6 = insertelement <16 x i8> %ins5, i8 %a6, i32 6 ++ %ins7 = insertelement <16 x i8> %ins6, i8 %a7, i32 7 ++ %ins8 = insertelement <16 x i8> %ins7, i8 %a8, i32 8 ++ %ins9 = insertelement <16 x i8> %ins8, i8 %a9, i32 9 ++ %ins10 = insertelement <16 x i8> %ins9, i8 %a10, i32 10 ++ %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11 ++ %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12 ++ %ins13 = insertelement <16 x i8> %ins12, i8 %a13, i32 13 ++ %ins14 = insertelement <16 x i8> %ins13, i8 %a14, i32 14 ++ %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15 ++ store <16 x i8> %ins15, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v8i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { ++; CHECK-LABEL: buildvector_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 2 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a4, 3 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 4 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a6, 5 ++; CHECK-NEXT: vinsgr2vr.h $vr0, $a7, 6 ++; CHECK-NEXT: ld.h $a1, $sp, 0 ++; 
CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 7 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0 ++ %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1 ++ %ins2 = insertelement <8 x i16> %ins1, i16 %a2, i32 2 ++ %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3 ++ %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4 ++ %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5 ++ %ins6 = insertelement <8 x i16> %ins5, i16 %a6, i32 6 ++ %ins7 = insertelement <8 x i16> %ins6, i16 %a7, i32 7 ++ store <8 x i16> %ins7, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4i32(ptr %dst, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { ++; CHECK-LABEL: buildvector_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a2, 1 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a3, 2 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a4, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0 ++ %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1 ++ %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2 ++ %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3 ++ store <4 x i32> %ins3, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2i64(ptr %dst, i64 %a0, i64 %a1) nounwind { ++; CHECK-LABEL: buildvector_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 0 ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a2, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <2 x i64> undef, i64 %a0, i32 0 ++ %ins1 = insertelement <2 x i64> %ins0, i64 %a1, i32 1 ++ store <2 x i64> %ins1, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v4f32(ptr %dst, float %a0, float %a1, float %a2, float %a3) nounwind { ++; CHECK-LABEL: buildvector_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: movfr2gr.s $a1, $fa0 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 0 ++; CHECK-NEXT: movfr2gr.s $a1, $fa1 
++; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 1 ++; CHECK-NEXT: movfr2gr.s $a1, $fa2 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 2 ++; CHECK-NEXT: movfr2gr.s $a1, $fa3 ++; CHECK-NEXT: vinsgr2vr.w $vr0, $a1, 3 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <4 x float> undef, float %a0, i32 0 ++ %ins1 = insertelement <4 x float> %ins0, float %a1, i32 1 ++ %ins2 = insertelement <4 x float> %ins1, float %a2, i32 2 ++ %ins3 = insertelement <4 x float> %ins2, float %a3, i32 3 ++ store <4 x float> %ins3, ptr %dst ++ ret void ++} ++ ++define void @buildvector_v2f64(ptr %dst, double %a0, double %a1) nounwind { ++; CHECK-LABEL: buildvector_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: movfr2gr.d $a1, $fa0 ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 0 ++; CHECK-NEXT: movfr2gr.d $a1, $fa1 ++; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %ins0 = insertelement <2 x double> undef, double %a0, i32 0 ++ %ins1 = insertelement <2 x double> %ins0, double %a1, i32 1 ++ store <2 x double> %ins1, ptr %dst ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll +index 5060240cd8b1..d0be9cb7e3c8 100644 +--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll +@@ -180,10 +180,9 @@ entry: + define void @mul_v16i8_17(ptr %res, ptr %a0) nounwind { + ; CHECK-LABEL: mul_v16i8_17: + ; CHECK: # %bb.0: # %entry +-; CHECK-NEXT: ori $a2, $zero, 17 +-; CHECK-NEXT: vreplgr2vr.b $vr0, $a2 +-; CHECK-NEXT: vld $vr1, $a1, 0 +-; CHECK-NEXT: vmul.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.b $vr1, 17 ++; CHECK-NEXT: vmul.b $vr0, $vr0, $vr1 + ; CHECK-NEXT: vst $vr0, $a0, 0 + ; CHECK-NEXT: ret + entry: +@@ -196,10 +195,9 @@ entry: + define void @mul_v8i16_17(ptr %res, ptr %a0) nounwind { + ; CHECK-LABEL: mul_v8i16_17: + ; CHECK: # %bb.0: # %entry 
+-; CHECK-NEXT: ori $a2, $zero, 17 +-; CHECK-NEXT: vreplgr2vr.h $vr0, $a2 +-; CHECK-NEXT: vld $vr1, $a1, 0 +-; CHECK-NEXT: vmul.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.h $vr1, 17 ++; CHECK-NEXT: vmul.h $vr0, $vr0, $vr1 + ; CHECK-NEXT: vst $vr0, $a0, 0 + ; CHECK-NEXT: ret + entry: +@@ -212,10 +210,9 @@ entry: + define void @mul_v4i32_17(ptr %res, ptr %a0) nounwind { + ; CHECK-LABEL: mul_v4i32_17: + ; CHECK: # %bb.0: # %entry +-; CHECK-NEXT: ori $a2, $zero, 17 +-; CHECK-NEXT: vreplgr2vr.w $vr0, $a2 +-; CHECK-NEXT: vld $vr1, $a1, 0 +-; CHECK-NEXT: vmul.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.w $vr1, 17 ++; CHECK-NEXT: vmul.w $vr0, $vr0, $vr1 + ; CHECK-NEXT: vst $vr0, $a0, 0 + ; CHECK-NEXT: ret + entry: +@@ -228,10 +225,9 @@ entry: + define void @mul_v2i64_17(ptr %res, ptr %a0) nounwind { + ; CHECK-LABEL: mul_v2i64_17: + ; CHECK: # %bb.0: # %entry +-; CHECK-NEXT: ori $a2, $zero, 17 +-; CHECK-NEXT: vreplgr2vr.d $vr0, $a2 +-; CHECK-NEXT: vld $vr1, $a1, 0 +-; CHECK-NEXT: vmul.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.d $vr1, 17 ++; CHECK-NEXT: vmul.d $vr0, $vr0, $vr1 + ; CHECK-NEXT: vst $vr0, $a0, 0 + ; CHECK-NEXT: ret + entry: +-- +2.20.1 + + +From 62970fc545cedb4640ded25af832fd233c16dc85 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 14 Nov 2023 17:58:52 +0800 +Subject: [PATCH 17/35] [LoongArch] Add more and/or/xor patterns for vector + types + +(cherry picked from commit ca66df3b021017fedf08f0779f5bfc7898dbdd29) +--- + .../LoongArch/LoongArchLASXInstrInfo.td | 21 +-- + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 21 +-- + .../LoongArch/lasx/ir-instruction/and.ll | 125 ++++++++++++++++++ + .../LoongArch/lasx/ir-instruction/or.ll | 125 ++++++++++++++++++ + .../LoongArch/lasx/ir-instruction/xor.ll | 125 ++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/and.ll | 125 ++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/or.ll | 125 ++++++++++++++++++ + 
.../LoongArch/lsx/ir-instruction/xor.ll | 125 ++++++++++++++++++ + 8 files changed, 774 insertions(+), 18 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/and.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/or.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/xor.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/and.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/or.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/xor.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 4487152fb42b..a5652472481a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1184,10 +1184,6 @@ multiclass PatShiftXrUimm { + (!cast(Inst#"_D") LASX256:$xj, uimm6:$imm)>; + } + +-class PatXrXrB +- : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), +- (Inst LASX256:$xj, LASX256:$xk)>; +- + let Predicates = [HasExtLASX] in { + + // XVADD_{B/H/W/D} +@@ -1235,13 +1231,20 @@ defm : PatXrXr; + defm : PatXrXrU; + + // XVAND_V +-def : PatXrXrB; +-// XVNOR_V +-def : PatXrXrB; ++foreach vt = [v32i8, v16i16, v8i32, v4i64] in ++def : Pat<(and (vt LASX256:$xj), (vt LASX256:$xk)), ++ (XVAND_V LASX256:$xj, LASX256:$xk)>; ++// XVOR_V ++foreach vt = [v32i8, v16i16, v8i32, v4i64] in ++def : Pat<(or (vt LASX256:$xj), (vt LASX256:$xk)), ++ (XVOR_V LASX256:$xj, LASX256:$xk)>; + // XVXOR_V +-def : PatXrXrB; ++foreach vt = [v32i8, v16i16, v8i32, v4i64] in ++def : Pat<(xor (vt LASX256:$xj), (vt LASX256:$xk)), ++ (XVXOR_V LASX256:$xj, LASX256:$xk)>; + // XVNOR_V +-def : Pat<(vnot (or (v32i8 LASX256:$xj), (v32i8 LASX256:$xk))), ++foreach vt = [v32i8, v16i16, v8i32, v4i64] in ++def : Pat<(vnot (or (vt LASX256:$xj), (vt LASX256:$xk))), + (XVNOR_V LASX256:$xj, LASX256:$xk)>; + + // XVANDI_B +diff --git 
a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index deac5015882d..5645ce51194a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1261,10 +1261,6 @@ multiclass PatShiftVrUimm { + (!cast(Inst#"_D") LSX128:$vj, uimm6:$imm)>; + } + +-class PatVrVrB +- : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), +- (Inst LSX128:$vj, LSX128:$vk)>; +- + let Predicates = [HasExtLSX] in { + + // VADD_{B/H/W/D} +@@ -1312,13 +1308,20 @@ defm : PatVrVr; + defm : PatVrVrU; + + // VAND_V +-def : PatVrVrB; +-// VNOR_V +-def : PatVrVrB; ++foreach vt = [v16i8, v8i16, v4i32, v2i64] in ++def : Pat<(and (vt LSX128:$vj), (vt LSX128:$vk)), ++ (VAND_V LSX128:$vj, LSX128:$vk)>; ++// VOR_V ++foreach vt = [v16i8, v8i16, v4i32, v2i64] in ++def : Pat<(or (vt LSX128:$vj), (vt LSX128:$vk)), ++ (VOR_V LSX128:$vj, LSX128:$vk)>; + // VXOR_V +-def : PatVrVrB; ++foreach vt = [v16i8, v8i16, v4i32, v2i64] in ++def : Pat<(xor (vt LSX128:$vj), (vt LSX128:$vk)), ++ (VXOR_V LSX128:$vj, LSX128:$vk)>; + // VNOR_V +-def : Pat<(vnot (or (v16i8 LSX128:$vj), (v16i8 LSX128:$vk))), ++foreach vt = [v16i8, v8i16, v4i32, v2i64] in ++def : Pat<(vnot (or (vt LSX128:$vj), (vt LSX128:$vk))), + (VNOR_V LSX128:$vj, LSX128:$vk)>; + + // VANDI_B +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/and.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/and.ll +new file mode 100644 +index 000000000000..98c87cadeeb5 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/and.ll +@@ -0,0 +1,125 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @and_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: 
xvand.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = and <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @and_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvand.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = and <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @and_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvand.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = and <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @and_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvand.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = and <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @and_u_v32i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvandi.b $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = and <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @and_u_v16i16(ptr %res, ptr %a0) nounwind { 
++; CHECK-LABEL: and_u_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.h $xr1, 31 ++; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = and <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @and_u_v8i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.w $xr1, 31 ++; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = and <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @and_u_v4i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.d $xr1, 31 ++; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = and <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/or.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/or.ll +new file mode 100644 +index 000000000000..f37cbf1cefed +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/or.ll +@@ -0,0 +1,125 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @or_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = or <32 x i8> %v0, %v1 ++ store <32 
x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @or_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = or <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @or_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = or <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @or_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = or <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @or_u_v32i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvori.b $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = or <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @or_u_v16i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.h $xr1, 31 ++; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: 
ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = or <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @or_u_v8i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.w $xr1, 31 ++; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = or <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @or_u_v4i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.d $xr1, 31 ++; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = or <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/xor.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/xor.ll +new file mode 100644 +index 000000000000..c2fb1462b7a2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/xor.ll +@@ -0,0 +1,125 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @xor_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvxor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = xor <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, 
$a1, 0 ++; CHECK-NEXT: xvxor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = xor <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvxor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = xor <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvxor.v $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = xor <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_u_v32i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvxori.b $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = xor <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @xor_u_v16i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.h $xr1, 31 ++; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = xor <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @xor_u_v8i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: 
xor_u_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.w $xr1, 31 ++; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = xor <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @xor_u_v4i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.d $xr1, 31 ++; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = xor <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/and.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/and.ll +new file mode 100644 +index 000000000000..523255159a81 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/and.ll +@@ -0,0 +1,125 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @and_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = and <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @and_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = and <8 x i16> %v0, 
%v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @and_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = and <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @and_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: and_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = and <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @and_u_v16i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vandi.b $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = and <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @and_u_v8i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.h $vr1, 31 ++; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = and <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @and_u_v4i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.w $vr1, 31 ++; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x 
i32>, ptr %a0 ++ %v1 = and <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @and_u_v2i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: and_u_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.d $vr1, 31 ++; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = and <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/or.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/or.ll +new file mode 100644 +index 000000000000..f124512acce7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/or.ll +@@ -0,0 +1,125 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @or_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = or <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @or_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = or <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @or_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; 
CHECK-NEXT: vor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = or <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @or_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: or_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = or <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @or_u_v16i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vori.b $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = or <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @or_u_v8i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.h $vr1, 31 ++; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = or <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @or_u_v4i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.w $vr1, 31 ++; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = or <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @or_u_v2i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: or_u_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.d 
$vr1, 31 ++; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = or <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/xor.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/xor.ll +new file mode 100644 +index 000000000000..ce3e49c990ff +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/xor.ll +@@ -0,0 +1,125 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @xor_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v2 = xor <16 x i8> %v0, %v1 ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v2 = xor <8 x i16> %v0, %v1 ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v2 = xor <4 x i32> %v0, %v1 ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void 
@xor_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: xor_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v2 = xor <2 x i64> %v0, %v1 ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @xor_u_v16i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vxori.b $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = xor <16 x i8> %v0, ++ store <16 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @xor_u_v8i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.h $vr1, 31 ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = xor <8 x i16> %v0, ++ store <8 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @xor_u_v4i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.w $vr1, 31 ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = xor <4 x i32> %v0, ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @xor_u_v2i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: xor_u_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.d $vr1, 31 ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = xor <2 x i64> %v0, ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +-- +2.20.1 + + 
+From f33b8ed69368098a23f9f14a1d3b8d62aca8b48f Mon Sep 17 00:00:00 2001 +From: leecheechen +Date: Fri, 1 Dec 2023 13:14:11 +0800 +Subject: [PATCH 18/35] [LoongArch] Add some binary IR instructions testcases + for LASX (#74031) + +The IR instructions include: +- Binary Operations: add fadd sub fsub mul fmul udiv sdiv fdiv +- Bitwise Binary Operations: shl lshr ashr + +(cherry picked from commit dbbc7c31c8e55d72dc243b244e386a25132e7215) +--- + .../LoongArch/lasx/ir-instruction/add.ll | 122 +++++++++ + .../LoongArch/lasx/ir-instruction/ashr.ll | 178 +++++++++++++ + .../LoongArch/lasx/ir-instruction/fadd.ll | 34 +++ + .../LoongArch/lasx/ir-instruction/fdiv.ll | 34 +++ + .../LoongArch/lasx/ir-instruction/fmul.ll | 34 +++ + .../LoongArch/lasx/ir-instruction/fsub.ll | 34 +++ + .../LoongArch/lasx/ir-instruction/lshr.ll | 178 +++++++++++++ + .../LoongArch/lasx/ir-instruction/mul.ll | 238 ++++++++++++++++++ + .../LoongArch/lasx/ir-instruction/sdiv.ll | 134 ++++++++++ + .../LoongArch/lasx/ir-instruction/shl.ll | 178 +++++++++++++ + .../LoongArch/lasx/ir-instruction/sub.ll | 122 +++++++++ + .../LoongArch/lasx/ir-instruction/udiv.ll | 122 +++++++++ + 12 files changed, 1408 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/add.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ashr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fadd.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fmul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fsub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/lshr.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mul.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sdiv.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shl.ll + create mode 100644 
llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sub.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/udiv.ll + +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/add.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/add.ll +new file mode 100644 +index 000000000000..8e4d0dc6f1c3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/add.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @add_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvadd.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = add <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvadd.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = add <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: add_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvadd.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = add <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; 
CHECK-LABEL: add_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = add <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @add_v32i8_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v32i8_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = add <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @add_v16i16_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v16i16_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = add <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @add_v8i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v8i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvaddi.wu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = add <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @add_v4i64_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: add_v4i64_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvaddi.du $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = add <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ashr.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ashr.ll +new file mode 
100644 +index 000000000000..fcbf0f1400fe +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ashr.ll +@@ -0,0 +1,178 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @ashr_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsra.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = ashr <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsra.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = ashr <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsra.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = ashr <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: ashr_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsra.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load 
<4 x i64>, ptr %a1 ++ %v2 = ashr <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @ashr_v32i8_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v32i8_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.b $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = ashr <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v32i8_7(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v32i8_7: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.b $xr0, $xr0, 7 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = ashr <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v16i16_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v16i16_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.h $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = ashr <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v16i16_15(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v16i16_15: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.h $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = ashr <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v8i32_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v8i32_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.w $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = ashr <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v8i32_31(ptr %res, ptr 
%a0) nounwind { ++; CHECK-LABEL: ashr_v8i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.w $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = ashr <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v4i64_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v4i64_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.d $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = ashr <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @ashr_v4i64_63(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: ashr_v4i64_63: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.d $xr0, $xr0, 63 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = ashr <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fadd.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fadd.ll +new file mode 100644 +index 000000000000..365bb305fc5a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fadd.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fadd_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fadd_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfadd.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = fadd <8 x float> %v0, %v1 ++ store <8 x float> %v2, ptr %res ++ ret void ++} ++ ++define void 
@fadd_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fadd_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfadd.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = fadd <4 x double> %v0, %v1 ++ store <4 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll +new file mode 100644 +index 000000000000..284121a79a49 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fdiv_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fdiv_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfdiv.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = fdiv <8 x float> %v0, %v1 ++ store <8 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fdiv_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fdiv_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfdiv.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = fdiv <4 x double> %v0, %v1 ++ store <4 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fmul.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fmul.ll +new file mode 100644 +index 000000000000..a48dca8d2847 +--- 
/dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fmul.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fmul_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fmul_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = fmul <8 x float> %v0, %v1 ++ store <8 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fmul_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fmul_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = fmul <4 x double> %v0, %v1 ++ store <4 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fsub.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fsub.ll +new file mode 100644 +index 000000000000..6164aa5a55c7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fsub.ll +@@ -0,0 +1,34 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fsub_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fsub_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfsub.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ 
%v2 = fsub <8 x float> %v0, %v1 ++ store <8 x float> %v2, ptr %res ++ ret void ++} ++ ++define void @fsub_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: fsub_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfsub.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = fsub <4 x double> %v0, %v1 ++ store <4 x double> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/lshr.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/lshr.ll +new file mode 100644 +index 000000000000..24be69d8032a +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/lshr.ll +@@ -0,0 +1,178 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @lshr_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsrl.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = lshr <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsrl.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = lshr <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v8i32: ++; CHECK: 
# %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsrl.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = lshr <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: lshr_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsrl.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = lshr <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @lshr_v32i8_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v32i8_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = lshr <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v32i8_7(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v32i8_7: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.b $xr0, $xr0, 7 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = lshr <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v16i16_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v16i16_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.h $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = lshr <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v16i16_15(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v16i16_15: ++; CHECK: 
# %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.h $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = lshr <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v8i32_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v8i32_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.w $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = lshr <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v8i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v8i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.w $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = lshr <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v4i64_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v4i64_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.d $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = lshr <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @lshr_v4i64_63(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: lshr_v4i64_63: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.d $xr0, $xr0, 63 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = lshr <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mul.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mul.ll +new file mode 100644 +index 000000000000..dcb893caa255 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mul.ll +@@ -0,0 +1,238 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @mul_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmul.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = mul <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmul.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = mul <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmul.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = mul <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @mul_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mul_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmul.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = mul <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, 
ptr %res ++ ret void ++} ++ ++define void @mul_square_v32i8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvmul.b $xr0, $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = mul <32 x i8> %v0, %v0 ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_square_v16i16(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvmul.h $xr0, $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = mul <16 x i16> %v0, %v0 ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_square_v8i32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvmul.w $xr0, $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = mul <8 x i32> %v0, %v0 ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_square_v4i64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_square_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvmul.d $xr0, $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = mul <4 x i64> %v0, %v0 ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v32i8_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v32i8_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.b $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = mul <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v16i16_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: 
mul_v16i16_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.h $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = mul <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v8i32_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v8i32_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.w $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = mul <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v4i64_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v4i64_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.d $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = mul <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v32i8_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v32i8_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.b $xr1, 17 ++; CHECK-NEXT: xvmul.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = mul <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v16i16_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v16i16_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.h $xr1, 17 ++; CHECK-NEXT: xvmul.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = mul <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v8i32_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v8i32_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; 
CHECK-NEXT: xvrepli.w $xr1, 17 ++; CHECK-NEXT: xvmul.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = mul <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @mul_v4i64_17(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: mul_v4i64_17: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.d $xr1, 17 ++; CHECK-NEXT: xvmul.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = mul <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sdiv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sdiv.ll +new file mode 100644 +index 000000000000..e3635a5f14a2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sdiv.ll +@@ -0,0 +1,134 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @sdiv_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = sdiv <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = sdiv <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} 
++ ++define void @sdiv_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = sdiv <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sdiv_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = sdiv <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @sdiv_v32i8_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v32i8_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.b $xr1, $xr0, 7 ++; CHECK-NEXT: xvsrli.b $xr1, $xr1, 5 ++; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvsrai.b $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = sdiv <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @sdiv_v16i16_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v16i16_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.h $xr1, $xr0, 15 ++; CHECK-NEXT: xvsrli.h $xr1, $xr1, 13 ++; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvsrai.h $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = sdiv <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @sdiv_v8i32_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v8i32_8: ++; CHECK: # %bb.0: # %entry ++; 
CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.w $xr1, $xr0, 31 ++; CHECK-NEXT: xvsrli.w $xr1, $xr1, 29 ++; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvsrai.w $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = sdiv <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @sdiv_v4i64_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sdiv_v4i64_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrai.d $xr1, $xr0, 63 ++; CHECK-NEXT: xvsrli.d $xr1, $xr1, 61 ++; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvsrai.d $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = sdiv <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shl.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shl.ll +new file mode 100644 +index 000000000000..8a02c7e3ac97 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shl.ll +@@ -0,0 +1,178 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @shl_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsll.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = shl <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsll.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: 
xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = shl <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsll.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = shl <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: shl_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsll.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = shl <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @shl_v32i8_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v32i8_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.b $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = shl <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v32i8_7(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v32i8_7: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.b $xr0, $xr0, 7 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = shl <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v16i16_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v16i16_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.h $xr0, 
$xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = shl <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v16i16_15(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v16i16_15: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.h $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = shl <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v8i32_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v8i32_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.w $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = shl <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v8i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v8i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.w $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = shl <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v4i64_1(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v4i64_1: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.d $xr0, $xr0, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = shl <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @shl_v4i64_63(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: shl_v4i64_63: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslli.d $xr0, $xr0, 63 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = shl <4 x i64> %v0, ++ store <4 x i64> %v1, 
ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sub.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sub.ll +new file mode 100644 +index 000000000000..bcfff1651477 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sub.ll +@@ -0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @sub_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsub.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = sub <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsub.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = sub <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsub.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = sub <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: sub_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 
0 ++; CHECK-NEXT: xvsub.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = sub <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @sub_v32i8_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v32i8_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsubi.bu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = sub <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @sub_v16i16_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v16i16_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsubi.hu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = sub <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @sub_v8i32_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v8i32_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsubi.wu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = sub <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @sub_v4i64_31(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sub_v4i64_31: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsubi.du $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = sub <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/udiv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/udiv.ll +new file mode 100644 +index 000000000000..e78084c7186d +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/udiv.ll +@@ 
-0,0 +1,122 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @udiv_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.bu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v2 = udiv <32 x i8> %v0, %v1 ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @udiv_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.hu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v2 = udiv <16 x i16> %v0, %v1 ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @udiv_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.wu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v2 = udiv <8 x i32> %v0, %v1 ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @udiv_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: udiv_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvdiv.du $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v2 = udiv <4 x i64> %v0, %v1 ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void 
@udiv_v32i8_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v32i8_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.b $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = udiv <32 x i8> %v0, ++ store <32 x i8> %v1, ptr %res ++ ret void ++} ++ ++define void @udiv_v16i16_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v16i16_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.h $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = udiv <16 x i16> %v0, ++ store <16 x i16> %v1, ptr %res ++ ret void ++} ++ ++define void @udiv_v8i32_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v8i32_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.w $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = udiv <8 x i32> %v0, ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @udiv_v4i64_8(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: udiv_v4i64_8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvsrli.d $xr0, $xr0, 3 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = udiv <4 x i64> %v0, ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 1b20d45ced302fa921b54294758687bc2c1df220 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Sat, 2 Dec 2023 14:25:17 +0800 +Subject: [PATCH 19/35] [LoongArch] Override TargetLowering::isShuffleMaskLegal + + By default, `isShuffleMaskLegal` always returns true, which can result + in the expansion of `BUILD_VECTOR` into a `VECTOR_SHUFFLE` node in + certain situations. Subsequently, the `VECTOR_SHUFFLE` node is expanded + again into a `BUILD_VECTOR`, leading to an infinite loop. 
+ To address this, we always return false, allowing the expansion of + `BUILD_VECTOR` through the stack. + +(cherry picked from commit 66a3e4fafb6eae19764f8a192ca3a116c0554211) +--- + .../LoongArch/LoongArchISelLowering.cpp | 10 +++++++++ + .../Target/LoongArch/LoongArchISelLowering.h | 5 +++++ + .../CodeGen/LoongArch/lsx/build-vector.ll | 22 +++++++++++++++++++ + 3 files changed, 37 insertions(+) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 1b60bfc3bddb..e45f21265d7b 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -239,6 +239,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, + Legal); +@@ -268,6 +269,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { ++ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction({ISD::ADD, ISD::SUB}, VT, Legal); + setOperationAction({ISD::UMAX, ISD::UMIN, ISD::SMAX, ISD::SMIN}, VT, + Legal); +@@ -370,10 +372,18 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + return lowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); ++ case ISD::VECTOR_SHUFFLE: ++ return lowerVECTOR_SHUFFLE(Op, DAG); + } + return SDValue(); + } + ++SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, ++ SelectionDAG &DAG) const { ++ // TODO: custom shuffle. 
++ return SDValue(); ++} ++ + static bool isConstantOrUndef(const SDValue Op) { + if (Op->isUndef()) + return true; +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 111376306374..2c35f9e5d378 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -230,6 +230,10 @@ public: + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + unsigned *Fast = nullptr) const override; + ++ bool isShuffleMaskLegal(ArrayRef Mask, EVT VT) const override { ++ return false; ++ } ++ + private: + /// Target-specific function used to lower LoongArch calling conventions. + typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI, +@@ -277,6 +281,7 @@ private: + SDValue lowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; +diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +index 3a74db5e1acb..ed1f610a5fa6 100644 +--- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll ++++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +@@ -374,3 +374,25 @@ entry: + store <2 x double> %ins1, ptr %dst + ret void + } ++ ++;; BUILD_VECTOR through stack. ++;; If `isShuffleMaskLegal` returns true, it will lead to an infinite loop. 
++define void @extract1_i32_zext_insert0_i64_undef(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef: ++; CHECK: # %bb.0: ++; CHECK-NEXT: addi.d $sp, $sp, -16 ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1 ++; CHECK-NEXT: bstrpick.d $a0, $a0, 31, 0 ++; CHECK-NEXT: st.d $a0, $sp, 0 ++; CHECK-NEXT: vld $vr0, $sp, 0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $sp, 16 ++; CHECK-NEXT: ret ++ %v = load volatile <4 x i32>, ptr %src ++ %e = extractelement <4 x i32> %v, i32 1 ++ %z = zext i32 %e to i64 ++ %r = insertelement <2 x i64> undef, i64 %z, i32 0 ++ store <2 x i64> %r, ptr %dst ++ ret void ++} +-- +2.20.1 + + +From aa1ff5f878a37004975a017d84b2e87df0ea8235 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Sat, 2 Dec 2023 16:24:33 +0800 +Subject: [PATCH 20/35] Reland "[LoongArch] Support CTLZ with lsx/lasx" + +This patch simultaneously adds tests for `CTPOP`. + +This relands 07cec73dcd095035257eec1f213d273b10988130 with fix tests. 
+ +(cherry picked from commit a60a5421b60be1bce0272385fa16846ada5eed5e) +--- + .../LoongArch/LoongArchISelLowering.cpp | 13 +- + .../LoongArch/LoongArchLASXInstrInfo.td | 11 +- + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 11 +- + .../test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll | 115 ++++++++++++++++++ + llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll | 115 ++++++++++++++++++ + 5 files changed, 255 insertions(+), 10 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index e45f21265d7b..358263b1a258 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -247,7 +247,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + VT, Legal); + setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, VT, Legal); + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); +- setOperationAction(ISD::CTPOP, VT, Legal); ++ setOperationAction({ISD::CTPOP, ISD::CTLZ}, VT, Legal); + } + for (MVT VT : {MVT::v4f32, MVT::v2f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); +@@ -277,7 +277,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + VT, Legal); + setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, VT, Legal); + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); +- setOperationAction(ISD::CTPOP, VT, Legal); ++ setOperationAction({ISD::CTPOP, ISD::CTLZ}, VT, Legal); + } + for (MVT VT : {MVT::v8f32, MVT::v4f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); +@@ -2800,6 +2800,15 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, + case Intrinsic::loongarch_lasx_xvsrai_d: + return DAG.getNode(ISD::SRA, DL, N->getValueType(0), N->getOperand(1), + lowerVectorSplatImm<6>(N, 2, DAG)); ++ case 
Intrinsic::loongarch_lsx_vclz_b: ++ case Intrinsic::loongarch_lsx_vclz_h: ++ case Intrinsic::loongarch_lsx_vclz_w: ++ case Intrinsic::loongarch_lsx_vclz_d: ++ case Intrinsic::loongarch_lasx_xvclz_b: ++ case Intrinsic::loongarch_lasx_xvclz_h: ++ case Intrinsic::loongarch_lasx_xvclz_w: ++ case Intrinsic::loongarch_lasx_xvclz_d: ++ return DAG.getNode(ISD::CTLZ, DL, N->getValueType(0), N->getOperand(1)); + case Intrinsic::loongarch_lsx_vpcnt_b: + case Intrinsic::loongarch_lsx_vpcnt_h: + case Intrinsic::loongarch_lsx_vpcnt_w: +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index a5652472481a..960ac627578c 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1273,6 +1273,9 @@ defm : PatXrXr; + defm : PatShiftXrXr; + defm : PatShiftXrUimm; + ++// XVCLZ_{B/H/W/D} ++defm : PatXr; ++ + // XVPCNT_{B/H/W/D} + defm : PatXr; + +@@ -1590,26 +1593,26 @@ foreach Inst = ["XVMADDWEV_Q_D", "XVMADDWOD_Q_D", "XVMADDWEV_Q_DU", + // (LAInst vty:$xj)>; + foreach Inst = ["XVEXTH_H_B", "XVEXTH_HU_BU", + "XVMSKLTZ_B", "XVMSKGEZ_B", "XVMSKNZ_B", +- "XVCLO_B", "XVCLZ_B", "VEXT2XV_H_B", "VEXT2XV_HU_BU", ++ "XVCLO_B", "VEXT2XV_H_B", "VEXT2XV_HU_BU", + "VEXT2XV_W_B", "VEXT2XV_WU_BU", "VEXT2XV_D_B", + "VEXT2XV_DU_BU", "XVREPLVE0_B", "XVREPLVE0_Q"] in + def : Pat<(deriveLASXIntrinsic.ret (v32i8 LASX256:$xj)), + (!cast(Inst) LASX256:$xj)>; + foreach Inst = ["XVEXTH_W_H", "XVEXTH_WU_HU", "XVMSKLTZ_H", +- "XVCLO_H", "XVCLZ_H", "XVFCVTL_S_H", "XVFCVTH_S_H", ++ "XVCLO_H", "XVFCVTL_S_H", "XVFCVTH_S_H", + "VEXT2XV_W_H", "VEXT2XV_WU_HU", "VEXT2XV_D_H", + "VEXT2XV_DU_HU", "XVREPLVE0_H"] in + def : Pat<(deriveLASXIntrinsic.ret (v16i16 LASX256:$xj)), + (!cast(Inst) LASX256:$xj)>; + foreach Inst = ["XVEXTH_D_W", "XVEXTH_DU_WU", "XVMSKLTZ_W", +- "XVCLO_W", "XVCLZ_W", "XVFFINT_S_W", "XVFFINT_S_WU", ++ "XVCLO_W", "XVFFINT_S_W", "XVFFINT_S_WU", + "XVFFINTL_D_W", 
"XVFFINTH_D_W", + "VEXT2XV_D_W", "VEXT2XV_DU_WU", "XVREPLVE0_W"] in + def : Pat<(deriveLASXIntrinsic.ret (v8i32 LASX256:$xj)), + (!cast(Inst) LASX256:$xj)>; + foreach Inst = ["XVEXTH_Q_D", "XVEXTH_QU_DU", "XVMSKLTZ_D", + "XVEXTL_Q_D", "XVEXTL_QU_DU", +- "XVCLO_D", "XVCLZ_D", "XVFFINT_D_L", "XVFFINT_D_LU", ++ "XVCLO_D", "XVFFINT_D_L", "XVFFINT_D_LU", + "XVREPLVE0_D"] in + def : Pat<(deriveLASXIntrinsic.ret (v4i64 LASX256:$xj)), + (!cast(Inst) LASX256:$xj)>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 5645ce51194a..3480ade9eebf 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1350,6 +1350,9 @@ defm : PatVrVr; + defm : PatShiftVrVr; + defm : PatShiftVrUimm; + ++// VCLZ_{B/H/W/D} ++defm : PatVr; ++ + // VPCNT_{B/H/W/D} + defm : PatVr; + +@@ -1674,21 +1677,21 @@ foreach Inst = ["VMADDWEV_Q_D", "VMADDWOD_Q_D", "VMADDWEV_Q_DU", + // (LAInst vty:$vj)>; + foreach Inst = ["VEXTH_H_B", "VEXTH_HU_BU", + "VMSKLTZ_B", "VMSKGEZ_B", "VMSKNZ_B", +- "VCLO_B", "VCLZ_B"] in ++ "VCLO_B"] in + def : Pat<(deriveLSXIntrinsic.ret (v16i8 LSX128:$vj)), + (!cast(Inst) LSX128:$vj)>; + foreach Inst = ["VEXTH_W_H", "VEXTH_WU_HU", "VMSKLTZ_H", +- "VCLO_H", "VCLZ_H", "VFCVTL_S_H", "VFCVTH_S_H"] in ++ "VCLO_H", "VFCVTL_S_H", "VFCVTH_S_H"] in + def : Pat<(deriveLSXIntrinsic.ret (v8i16 LSX128:$vj)), + (!cast(Inst) LSX128:$vj)>; + foreach Inst = ["VEXTH_D_W", "VEXTH_DU_WU", "VMSKLTZ_W", +- "VCLO_W", "VCLZ_W", "VFFINT_S_W", "VFFINT_S_WU", ++ "VCLO_W", "VFFINT_S_W", "VFFINT_S_WU", + "VFFINTL_D_W", "VFFINTH_D_W"] in + def : Pat<(deriveLSXIntrinsic.ret (v4i32 LSX128:$vj)), + (!cast(Inst) LSX128:$vj)>; + foreach Inst = ["VEXTH_Q_D", "VEXTH_QU_DU", "VMSKLTZ_D", + "VEXTL_Q_D", "VEXTL_QU_DU", +- "VCLO_D", "VCLZ_D", "VFFINT_D_L", "VFFINT_D_LU"] in ++ "VCLO_D", "VFFINT_D_L", "VFFINT_D_LU"] in + def : Pat<(deriveLSXIntrinsic.ret (v2i64 LSX128:$vj)), + 
(!cast(Inst) LSX128:$vj)>; + +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll +new file mode 100644 +index 000000000000..7786e399c95f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll +@@ -0,0 +1,115 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @ctpop_v32i8(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvpcnt.b $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <32 x i8>, ptr %src ++ %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %v) ++ store <32 x i8> %res, ptr %dst ++ ret void ++} ++ ++define void @ctpop_v16i16(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvpcnt.h $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <16 x i16>, ptr %src ++ %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %v) ++ store <16 x i16> %res, ptr %dst ++ ret void ++} ++ ++define void @ctpop_v8i32(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvpcnt.w $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <8 x i32>, ptr %src ++ %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %v) ++ store <8 x i32> %res, ptr %dst ++ ret void ++} ++ ++define void @ctpop_v4i64(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvpcnt.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <4 x i64>, ptr %src ++ %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %v) ++ store <4 x i64> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v32i8(ptr %src, ptr %dst) nounwind { 
++; CHECK-LABEL: ctlz_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvclz.b $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <32 x i8>, ptr %src ++ %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %v, i1 false) ++ store <32 x i8> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v16i16(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvclz.h $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <16 x i16>, ptr %src ++ %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %v, i1 false) ++ store <16 x i16> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v8i32(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvclz.w $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <8 x i32>, ptr %src ++ %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %v, i1 false) ++ store <8 x i32> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvclz.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <4 x i64>, ptr %src ++ %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %v, i1 false) ++ store <4 x i64> %res, ptr %dst ++ ret void ++} ++ ++declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) ++declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) ++declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) ++declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) ++declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1) ++declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1) ++declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) ++declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll +new 
file mode 100644 +index 000000000000..5df553fba7ef +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll +@@ -0,0 +1,115 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @ctpop_v16i8(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpcnt.b $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <16 x i8>, ptr %src ++ %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %v) ++ store <16 x i8> %res, ptr %dst ++ ret void ++} ++ ++define void @ctpop_v8i16(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpcnt.h $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <8 x i16>, ptr %src ++ %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %v) ++ store <8 x i16> %res, ptr %dst ++ ret void ++} ++ ++define void @ctpop_v4i32(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpcnt.w $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <4 x i32>, ptr %src ++ %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %v) ++ store <4 x i32> %res, ptr %dst ++ ret void ++} ++ ++define void @ctpop_v2i64(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctpop_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vpcnt.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <2 x i64>, ptr %src ++ %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %v) ++ store <2 x i64> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v16i8(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vclz.b $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: 
ret ++ %v = load <16 x i8>, ptr %src ++ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %v, i1 false) ++ store <16 x i8> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v8i16(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vclz.h $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <8 x i16>, ptr %src ++ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %v, i1 false) ++ store <8 x i16> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v4i32(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vclz.w $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <4 x i32>, ptr %src ++ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %v, i1 false) ++ store <4 x i32> %res, ptr %dst ++ ret void ++} ++ ++define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind { ++; CHECK-LABEL: ctlz_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a0, 0 ++; CHECK-NEXT: vclz.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a1, 0 ++; CHECK-NEXT: ret ++ %v = load <2 x i64>, ptr %src ++ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %v, i1 false) ++ store <2 x i64> %res, ptr %dst ++ ret void ++} ++ ++declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) ++declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) ++declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) ++declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) ++declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) ++declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) ++declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) ++declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) +-- +2.20.1 + + +From aa55afe1a5c74c325f009c58f48645107fd95e11 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Mon, 4 Dec 2023 10:44:39 +0800 +Subject: [PATCH 21/35] [LoongArch] Support MULHS/MULHU with lsx/lasx + +Mark MULHS/MULHU nodes as legal and adds the necessary patterns. 
+ +(cherry picked from commit e9cd197d15300f186a5a32092103add65fbd3f50) +--- + .../LoongArch/LoongArchISelLowering.cpp | 2 + + .../LoongArch/LoongArchLASXInstrInfo.td | 4 + + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 4 + + llvm/test/CodeGen/LoongArch/lasx/mulh.ll | 162 ++++++++++++++++++ + llvm/test/CodeGen/LoongArch/lsx/mulh.ll | 162 ++++++++++++++++++ + 5 files changed, 334 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/mulh.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/mulh.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 358263b1a258..3d8d6898a4d5 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -248,6 +248,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, VT, Legal); + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); + setOperationAction({ISD::CTPOP, ISD::CTLZ}, VT, Legal); ++ setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Legal); + } + for (MVT VT : {MVT::v4f32, MVT::v2f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); +@@ -278,6 +279,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, VT, Legal); + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); + setOperationAction({ISD::CTPOP, ISD::CTLZ}, VT, Legal); ++ setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Legal); + } + for (MVT VT : {MVT::v8f32, MVT::v4f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 960ac627578c..240f28b0dc5a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1217,6 +1217,10 @@ 
defm : PatXrUimm5; + // XVMUL_{B/H/W/D} + defm : PatXrXr; + ++// XVMUH_{B/H/W/D}[U] ++defm : PatXrXr; ++defm : PatXrXrU; ++ + // XVMADD_{B/H/W/D} + defm : PatXrXrXr; + // XVMSUB_{B/H/W/D} +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 3480ade9eebf..fb4726c530b5 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1294,6 +1294,10 @@ defm : PatVrUimm5; + // VMUL_{B/H/W/D} + defm : PatVrVr; + ++// VMUH_{B/H/W/D}[U] ++defm : PatVrVr; ++defm : PatVrVrU; ++ + // VMADD_{B/H/W/D} + defm : PatVrVrVr; + // VMSUB_{B/H/W/D} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/mulh.ll b/llvm/test/CodeGen/LoongArch/lasx/mulh.ll +new file mode 100644 +index 000000000000..aac711a4a371 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/mulh.ll +@@ -0,0 +1,162 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @mulhs_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %v0s = sext <32 x i8> %v0 to <32 x i16> ++ %v1s = sext <32 x i8> %v1 to <32 x i16> ++ %m = mul <32 x i16> %v0s, %v1s ++ %s = ashr <32 x i16> %m, ++ %v2 = trunc <32 x i16> %s to <32 x i8> ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v32i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.bu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <32 x i8>, ptr %a0 ++ 
%v1 = load <32 x i8>, ptr %a1 ++ %v0z = zext <32 x i8> %v0 to <32 x i16> ++ %v1z = zext <32 x i8> %v1 to <32 x i16> ++ %m = mul <32 x i16> %v0z, %v1z ++ %s = lshr <32 x i16> %m, ++ %v2 = trunc <32 x i16> %s to <32 x i8> ++ store <32 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhs_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v0s = sext <16 x i16> %v0 to <16 x i32> ++ %v1s = sext <16 x i16> %v1 to <16 x i32> ++ %m = mul <16 x i32> %v0s, %v1s ++ %s = ashr <16 x i32> %m, ++ %v2 = trunc <16 x i32> %s to <16 x i16> ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v16i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.hu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %v0z = zext <16 x i16> %v0 to <16 x i32> ++ %v1z = zext <16 x i16> %v1 to <16 x i32> ++ %m = mul <16 x i32> %v0z, %v1z ++ %s = lshr <16 x i32> %m, ++ %v2 = trunc <16 x i32> %s to <16 x i16> ++ store <16 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhs_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v0s = sext <8 x i32> %v0 to <8 x i64> ++ %v1s = sext <8 x i32> %v1 to <8 x i64> ++ %m = mul <8 x i64> %v0s, %v1s ++ %s = ashr <8 x 
i64> %m, ++ %v2 = trunc <8 x i64> %s to <8 x i32> ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v8i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.wu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %v0z = zext <8 x i32> %v0 to <8 x i64> ++ %v1z = zext <8 x i32> %v1 to <8 x i64> ++ %m = mul <8 x i64> %v0z, %v1z ++ %s = lshr <8 x i64> %m, ++ %v2 = trunc <8 x i64> %s to <8 x i32> ++ store <8 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhs_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v0s = sext <4 x i64> %v0 to <4 x i128> ++ %v1s = sext <4 x i64> %v1 to <4 x i128> ++ %m = mul <4 x i128> %v0s, %v1s ++ %s = ashr <4 x i128> %m, ++ %v2 = trunc <4 x i128> %s to <4 x i64> ++ store <4 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v4i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvmuh.du $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %v0z = zext <4 x i64> %v0 to <4 x i128> ++ %v1z = zext <4 x i64> %v1 to <4 x i128> ++ %m = mul <4 x i128> %v0z, %v1z ++ %s = lshr <4 x i128> %m, ++ %v2 = trunc <4 x i128> %s to <4 x i64> ++ store <4 x i64> %v2, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/mulh.ll 
b/llvm/test/CodeGen/LoongArch/lsx/mulh.ll +new file mode 100644 +index 000000000000..e1388f00e355 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/mulh.ll +@@ -0,0 +1,162 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @mulhs_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v0s = sext <16 x i8> %v0 to <16 x i16> ++ %v1s = sext <16 x i8> %v1 to <16 x i16> ++ %m = mul <16 x i16> %v0s, %v1s ++ %s = ashr <16 x i16> %m, ++ %v2 = trunc <16 x i16> %s to <16 x i8> ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v16i8: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.bu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %v0z = zext <16 x i8> %v0 to <16 x i16> ++ %v1z = zext <16 x i8> %v1 to <16 x i16> ++ %m = mul <16 x i16> %v0z, %v1z ++ %s = lshr <16 x i16> %m, ++ %v2 = trunc <16 x i16> %s to <16 x i8> ++ store <16 x i8> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhs_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v0s = sext <8 x i16> %v0 to <8 x i32> ++ %v1s = sext <8 x i16> %v1 to <8 x i32> ++ %m = mul <8 x i32> 
%v0s, %v1s ++ %s = ashr <8 x i32> %m, ++ %v2 = trunc <8 x i32> %s to <8 x i16> ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v8i16: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.hu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %v0z = zext <8 x i16> %v0 to <8 x i32> ++ %v1z = zext <8 x i16> %v1 to <8 x i32> ++ %m = mul <8 x i32> %v0z, %v1z ++ %s = lshr <8 x i32> %m, ++ %v2 = trunc <8 x i32> %s to <8 x i16> ++ store <8 x i16> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhs_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhs_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v0s = sext <4 x i32> %v0 to <4 x i64> ++ %v1s = sext <4 x i32> %v1 to <4 x i64> ++ %m = mul <4 x i64> %v0s, %v1s ++ %s = ashr <4 x i64> %m, ++ %v2 = trunc <4 x i64> %s to <4 x i32> ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v4i32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.wu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %v0z = zext <4 x i32> %v0 to <4 x i64> ++ %v1z = zext <4 x i32> %v1 to <4 x i64> ++ %m = mul <4 x i64> %v0z, %v1z ++ %s = lshr <4 x i64> %m, ++ %v2 = trunc <4 x i64> %s to <4 x i32> ++ store <4 x i32> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhs_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; 
CHECK-LABEL: mulhs_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v0s = sext <2 x i64> %v0 to <2 x i128> ++ %v1s = sext <2 x i64> %v1 to <2 x i128> ++ %m = mul <2 x i128> %v0s, %v1s ++ %s = ashr <2 x i128> %m, ++ %v2 = trunc <2 x i128> %s to <2 x i64> ++ store <2 x i64> %v2, ptr %res ++ ret void ++} ++ ++define void @mulhu_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: mulhu_v2i64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vmuh.du $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %v0z = zext <2 x i64> %v0 to <2 x i128> ++ %v1z = zext <2 x i64> %v1 to <2 x i128> ++ %m = mul <2 x i128> %v0z, %v1z ++ %s = lshr <2 x i128> %m, ++ %v2 = trunc <2 x i128> %s to <2 x i64> ++ store <2 x i64> %v2, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 7d2d996fdab4fa9279318174f5b8042cc7ace0a6 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Wed, 6 Dec 2023 16:43:38 +0800 +Subject: [PATCH 22/35] [LoongArch] Make ISD::VSELECT a legal operation with + lsx/lasx + +(cherry picked from commit de21308f78f3b0f0910638dbdac90967150d19f0) +--- + .../LoongArch/LoongArchISelLowering.cpp | 5 ++ + .../LoongArch/LoongArchLASXInstrInfo.td | 8 ++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 8 ++ + llvm/test/CodeGen/LoongArch/lasx/vselect.ll | 86 +++++++++++++++++++ + llvm/test/CodeGen/LoongArch/lsx/vselect.ll | 86 +++++++++++++++++++ + 5 files changed, 193 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/vselect.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/vselect.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp 
b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 3d8d6898a4d5..229251987ae4 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -237,6 +237,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); ++ ++ setOperationAction(ISD::VSELECT, VT, Legal); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); +@@ -268,6 +270,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); ++ ++ setOperationAction(ISD::VSELECT, VT, Legal); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); +@@ -305,6 +309,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setStackPointerRegisterToSaveRestore(LoongArch::R3); + + setBooleanContents(ZeroOrOneBooleanContent); ++ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + setMaxAtomicSizeInBitsSupported(Subtarget.getGRLen()); + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 240f28b0dc5a..0bd8db1bfdf0 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1480,6 +1480,14 @@ def : Pat<(f32 (vector_extract v8f32:$xj, i64:$rk)), + def : Pat<(f64 (vector_extract v4f64:$xj, i64:$rk)), + (f64 (EXTRACT_SUBREG (XVREPLVE_D v4f64:$xj, i64:$rk), sub_64))>; + ++// vselect ++def : Pat<(v32i8 (vselect LASX256:$xj, LASX256:$xd, ++ (v32i8 (SplatPat_uimm8 
uimm8:$imm)))), ++ (XVBITSELI_B LASX256:$xd, LASX256:$xj, uimm8:$imm)>; ++foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in ++ def : Pat<(vt (vselect LASX256:$xa, LASX256:$xk, LASX256:$xj)), ++ (XVBITSEL_V LASX256:$xj, LASX256:$xk, LASX256:$xa)>; ++ + } // Predicates = [HasExtLASX] + + /// Intrinsic pattern +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index fb4726c530b5..5800ff6f6266 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1564,6 +1564,14 @@ def : Pat<(f32 (vector_extract v4f32:$vj, i64:$rk)), + def : Pat<(f64 (vector_extract v2f64:$vj, i64:$rk)), + (f64 (EXTRACT_SUBREG (VREPLVE_D v2f64:$vj, i64:$rk), sub_64))>; + ++// vselect ++def : Pat<(v16i8 (vselect LSX128:$vj, LSX128:$vd, ++ (v16i8 (SplatPat_uimm8 uimm8:$imm)))), ++ (VBITSELI_B LSX128:$vd, LSX128:$vj, uimm8:$imm)>; ++foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in ++ def : Pat<(vt (vselect LSX128:$va, LSX128:$vk, LSX128:$vj)), ++ (VBITSEL_V LSX128:$vj, LSX128:$vk, LSX128:$va)>; ++ + } // Predicates = [HasExtLSX] + + /// Intrinsic pattern +diff --git a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll +new file mode 100644 +index 000000000000..24f4bcf752d3 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll +@@ -0,0 +1,86 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @select_v32i8_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: select_v32i8_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvrepli.h $xr1, -256 ++; CHECK-NEXT: xvbitseli.b $xr0, $xr1, 1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %sel = select <32 x i1> , <32 x i8> %v0, <32 x i8> ++ store <32 x i8> %sel, ptr %res ++ ret 
void ++} ++ ++define void @select_v32i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v32i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvrepli.h $xr2, -256 ++; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %sel = select <32 x i1> , <32 x i8> %v0, <32 x i8> %v1 ++ store <32 x i8> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v16i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: lu12i.w $a1, -16 ++; CHECK-NEXT: lu32i.d $a1, 0 ++; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 ++; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %sel = select <16 x i1> , <16 x i16> %v0, <16 x i16> %v1 ++ store <16 x i16> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: ori $a1, $zero, 0 ++; CHECK-NEXT: lu32i.d $a1, -1 ++; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 ++; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %sel = select <8 x i1> , <8 x i32> %v0, <8 x i32> %v1 ++ store <8 x i32> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v4i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) ++; CHECK-NEXT: addi.d $a3, $a3, %pc_lo12(.LCPI4_0) ++; CHECK-NEXT: xvld $xr0, $a3, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvld $xr2, $a2, 0 ++; 
CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %sel = select <4 x i1> , <4 x i64> %v0, <4 x i64> %v1 ++ store <4 x i64> %sel, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll +new file mode 100644 +index 000000000000..00e3d9313f13 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll +@@ -0,0 +1,86 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @select_v16i8_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: select_v16i8_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vrepli.h $vr1, -256 ++; CHECK-NEXT: vbitseli.b $vr0, $vr1, 255 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %sel = select <16 x i1> , <16 x i8> %v0, <16 x i8> ++ store <16 x i8> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v16i8(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v16i8: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vrepli.h $vr2, -256 ++; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %sel = select <16 x i1> , <16 x i8> %v0, <16 x i8> %v1 ++ store <16 x i8> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v8i16: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: lu12i.w $a1, -16 ++; CHECK-NEXT: lu32i.d $a1, 0 ++; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 ++; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, 
ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %sel = select <8 x i1> , <8 x i16> %v0, <8 x i16> %v1 ++ store <8 x i16> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v4i32(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: ori $a1, $zero, 0 ++; CHECK-NEXT: lu32i.d $a1, -1 ++; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 ++; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %sel = select <4 x i1> , <4 x i32> %v0, <4 x i32> %v1 ++ store <4 x i32> %sel, ptr %res ++ ret void ++} ++ ++define void @select_v2i64(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: select_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) ++; CHECK-NEXT: addi.d $a3, $a3, %pc_lo12(.LCPI4_0) ++; CHECK-NEXT: vld $vr0, $a3, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vld $vr2, $a2, 0 ++; CHECK-NEXT: vbitsel.v $vr0, $vr2, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %sel = select <2 x i1> , <2 x i64> %v0, <2 x i64> %v1 ++ store <2 x i64> %sel, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 051e8cc8c17b13c4cb5ccd81038a305580fe3228 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Thu, 7 Dec 2023 20:11:43 +0800 +Subject: [PATCH 23/35] [LoongArch] Add codegen support for icmp/fcmp with + lsx/lasx fetaures (#74700) + +Mark ISD::SETCC node as legal, and add handling for the vector types +condition codes. 
+ +(cherry picked from commit 9ff7d0ebeb54347f9006405a6d08ed2b713bc411) +--- + .../LoongArch/LoongArchISelLowering.cpp | 14 + + .../LoongArch/LoongArchLASXInstrInfo.td | 95 ++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 95 ++ + .../LoongArch/lasx/ir-instruction/fcmp.ll | 692 +++++++++++++ + .../LoongArch/lasx/ir-instruction/icmp.ll | 939 ++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/fcmp.ll | 692 +++++++++++++ + .../LoongArch/lsx/ir-instruction/icmp.ll | 939 ++++++++++++++++++ + 7 files changed, 3466 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fcmp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/icmp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fcmp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/icmp.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 229251987ae4..3d5ae6d3deda 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -238,6 +238,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + ++ setOperationAction(ISD::SETCC, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); + } + for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { +@@ -251,11 +252,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); + setOperationAction({ISD::CTPOP, ISD::CTLZ}, VT, Legal); + setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Legal); ++ setCondCodeAction( ++ {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, ++ Expand); + } + for (MVT VT : {MVT::v4f32, MVT::v2f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + 
setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); ++ setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ++ ISD::SETUGE, ISD::SETUGT}, ++ VT, Expand); + } + } + +@@ -271,6 +278,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + ++ setOperationAction(ISD::SETCC, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Legal); + } + for (MVT VT : {MVT::v4i64, MVT::v8i32, MVT::v16i16, MVT::v32i8}) { +@@ -284,11 +292,17 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL}, VT, Legal); + setOperationAction({ISD::CTPOP, ISD::CTLZ}, VT, Legal); + setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Legal); ++ setCondCodeAction( ++ {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, ++ Expand); + } + for (MVT VT : {MVT::v8f32, MVT::v4f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); ++ setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ++ ISD::SETUGE, ISD::SETUGT}, ++ VT, Expand); + } + } + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 0bd8db1bfdf0..a9bf65c6840d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1184,6 +1184,65 @@ multiclass PatShiftXrUimm { + (!cast(Inst#"_D") LASX256:$xj, uimm6:$imm)>; + } + ++multiclass PatCCXrSimm5 { ++ def : Pat<(v32i8 (setcc (v32i8 LASX256:$xj), ++ (v32i8 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_B") LASX256:$xj, simm5:$imm)>; ++ def : Pat<(v16i16 (setcc (v16i16 LASX256:$xj), ++ (v16i16 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_H") 
LASX256:$xj, simm5:$imm)>; ++ def : Pat<(v8i32 (setcc (v8i32 LASX256:$xj), ++ (v8i32 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_W") LASX256:$xj, simm5:$imm)>; ++ def : Pat<(v4i64 (setcc (v4i64 LASX256:$xj), ++ (v4i64 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_D") LASX256:$xj, simm5:$imm)>; ++} ++ ++multiclass PatCCXrUimm5 { ++ def : Pat<(v32i8 (setcc (v32i8 LASX256:$xj), ++ (v32i8 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_BU") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(v16i16 (setcc (v16i16 LASX256:$xj), ++ (v16i16 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_HU") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(v8i32 (setcc (v8i32 LASX256:$xj), ++ (v8i32 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_WU") LASX256:$xj, uimm5:$imm)>; ++ def : Pat<(v4i64 (setcc (v4i64 LASX256:$xj), ++ (v4i64 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_DU") LASX256:$xj, uimm5:$imm)>; ++} ++ ++multiclass PatCCXrXr { ++ def : Pat<(v32i8 (setcc (v32i8 LASX256:$xj), (v32i8 LASX256:$xk), CC)), ++ (!cast(Inst#"_B") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v16i16 (setcc (v16i16 LASX256:$xj), (v16i16 LASX256:$xk), CC)), ++ (!cast(Inst#"_H") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v8i32 (setcc (v8i32 LASX256:$xj), (v8i32 LASX256:$xk), CC)), ++ (!cast(Inst#"_W") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v4i64 (setcc (v4i64 LASX256:$xj), (v4i64 LASX256:$xk), CC)), ++ (!cast(Inst#"_D") LASX256:$xj, LASX256:$xk)>; ++} ++ ++multiclass PatCCXrXrU { ++ def : Pat<(v32i8 (setcc (v32i8 LASX256:$xj), (v32i8 LASX256:$xk), CC)), ++ (!cast(Inst#"_BU") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v16i16 (setcc (v16i16 LASX256:$xj), (v16i16 LASX256:$xk), CC)), ++ (!cast(Inst#"_HU") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v8i32 (setcc (v8i32 LASX256:$xj), (v8i32 LASX256:$xk), CC)), ++ (!cast(Inst#"_WU") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v4i64 (setcc (v4i64 LASX256:$xj), (v4i64 LASX256:$xk), CC)), ++ (!cast(Inst#"_DU") LASX256:$xj, LASX256:$xk)>; ++} ++ 
++multiclass PatCCXrXrF { ++ def : Pat<(v8i32 (setcc (v8f32 LASX256:$xj), (v8f32 LASX256:$xk), CC)), ++ (!cast(Inst#"_S") LASX256:$xj, LASX256:$xk)>; ++ def : Pat<(v4i64 (setcc (v4f64 LASX256:$xj), (v4f64 LASX256:$xk), CC)), ++ (!cast(Inst#"_D") LASX256:$xj, LASX256:$xk)>; ++} ++ + let Predicates = [HasExtLASX] in { + + // XVADD_{B/H/W/D} +@@ -1389,6 +1448,42 @@ def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), + def : Pat<(fma v4f64:$xj, v4f64:$xk, v4f64:$xa), + (XVFMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; + ++// XVSEQ[I]_{B/H/W/D} ++defm : PatCCXrSimm5; ++defm : PatCCXrXr; ++ ++// XVSLE[I]_{B/H/W/D}[U] ++defm : PatCCXrSimm5; ++defm : PatCCXrUimm5; ++defm : PatCCXrXr; ++defm : PatCCXrXrU; ++ ++// XVSLT[I]_{B/H/W/D}[U] ++defm : PatCCXrSimm5; ++defm : PatCCXrUimm5; ++defm : PatCCXrXr; ++defm : PatCCXrXrU; ++ ++// XVFCMP.cond.{S/D} ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++ ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++ ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++ ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++ ++defm : PatCCXrXrF; ++defm : PatCCXrXrF; ++ + // PseudoXVINSGR2VR_{B/H} + def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), + (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 5800ff6f6266..ff21c6681271 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1261,6 +1261,65 @@ multiclass PatShiftVrUimm { + (!cast(Inst#"_D") LSX128:$vj, uimm6:$imm)>; + } + ++multiclass PatCCVrSimm5 { ++ def : Pat<(v16i8 (setcc (v16i8 LSX128:$vj), ++ (v16i8 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_B") LSX128:$vj, simm5:$imm)>; ++ def : Pat<(v8i16 (setcc (v8i16 LSX128:$vj), ++ (v8i16 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_H") LSX128:$vj, simm5:$imm)>; ++ def : 
Pat<(v4i32 (setcc (v4i32 LSX128:$vj), ++ (v4i32 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_W") LSX128:$vj, simm5:$imm)>; ++ def : Pat<(v2i64 (setcc (v2i64 LSX128:$vj), ++ (v2i64 (SplatPat_simm5 simm5:$imm)), CC)), ++ (!cast(Inst#"_D") LSX128:$vj, simm5:$imm)>; ++} ++ ++multiclass PatCCVrUimm5 { ++ def : Pat<(v16i8 (setcc (v16i8 LSX128:$vj), ++ (v16i8 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_BU") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(v8i16 (setcc (v8i16 LSX128:$vj), ++ (v8i16 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_HU") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(v4i32 (setcc (v4i32 LSX128:$vj), ++ (v4i32 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_WU") LSX128:$vj, uimm5:$imm)>; ++ def : Pat<(v2i64 (setcc (v2i64 LSX128:$vj), ++ (v2i64 (SplatPat_uimm5 uimm5:$imm)), CC)), ++ (!cast(Inst#"_DU") LSX128:$vj, uimm5:$imm)>; ++} ++ ++multiclass PatCCVrVr { ++ def : Pat<(v16i8 (setcc (v16i8 LSX128:$vj), (v16i8 LSX128:$vk), CC)), ++ (!cast(Inst#"_B") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v8i16 (setcc (v8i16 LSX128:$vj), (v8i16 LSX128:$vk), CC)), ++ (!cast(Inst#"_H") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v4i32 (setcc (v4i32 LSX128:$vj), (v4i32 LSX128:$vk), CC)), ++ (!cast(Inst#"_W") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v2i64 (setcc (v2i64 LSX128:$vj), (v2i64 LSX128:$vk), CC)), ++ (!cast(Inst#"_D") LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatCCVrVrU { ++ def : Pat<(v16i8 (setcc (v16i8 LSX128:$vj), (v16i8 LSX128:$vk), CC)), ++ (!cast(Inst#"_BU") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v8i16 (setcc (v8i16 LSX128:$vj), (v8i16 LSX128:$vk), CC)), ++ (!cast(Inst#"_HU") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v4i32 (setcc (v4i32 LSX128:$vj), (v4i32 LSX128:$vk), CC)), ++ (!cast(Inst#"_WU") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v2i64 (setcc (v2i64 LSX128:$vj), (v2i64 LSX128:$vk), CC)), ++ (!cast(Inst#"_DU") LSX128:$vj, LSX128:$vk)>; ++} ++ ++multiclass PatCCVrVrF { ++ def : Pat<(v4i32 (setcc (v4f32 LSX128:$vj), (v4f32 LSX128:$vk), 
CC)), ++ (!cast(Inst#"_S") LSX128:$vj, LSX128:$vk)>; ++ def : Pat<(v2i64 (setcc (v2f64 LSX128:$vj), (v2f64 LSX128:$vk), CC)), ++ (!cast(Inst#"_D") LSX128:$vj, LSX128:$vk)>; ++} ++ + let Predicates = [HasExtLSX] in { + + // VADD_{B/H/W/D} +@@ -1466,6 +1525,42 @@ def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), + def : Pat<(fma v2f64:$vj, v2f64:$vk, v2f64:$va), + (VFMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; + ++// VSEQ[I]_{B/H/W/D} ++defm : PatCCVrSimm5; ++defm : PatCCVrVr; ++ ++// VSLE[I]_{B/H/W/D}[U] ++defm : PatCCVrSimm5; ++defm : PatCCVrUimm5; ++defm : PatCCVrVr; ++defm : PatCCVrVrU; ++ ++// VSLT[I]_{B/H/W/D}[U] ++defm : PatCCVrSimm5; ++defm : PatCCVrUimm5; ++defm : PatCCVrVr; ++defm : PatCCVrVrU; ++ ++// VFCMP.cond.{S/D} ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++ ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++ ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++ ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++ ++defm : PatCCVrVrF; ++defm : PatCCVrVrF; ++ + // VINSGR2VR_{B/H/W/D} + def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), + (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fcmp.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fcmp.ll +new file mode 100644 +index 000000000000..ef67dbc100c0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fcmp.ll +@@ -0,0 +1,692 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++;; TREU ++define void @v8f32_fcmp_true(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_true: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepli.b $xr0, -1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp true <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ 
store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++;; FALSE ++define void @v4f64_fcmp_false(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_false: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvrepli.b $xr0, 0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp false <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETOEQ ++define void @v8f32_fcmp_oeq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_oeq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.ceq.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp oeq <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_oeq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_oeq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.ceq.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp oeq <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETUEQ ++define void @v8f32_fcmp_ueq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ueq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cueq.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp ueq <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ 
++define void @v4f64_fcmp_ueq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ueq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cueq.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ueq <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETEQ ++define void @v8f32_fcmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.ceq.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp fast oeq <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.ceq.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp fast ueq <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETOLE ++define void @v8f32_fcmp_ole(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ole: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cle.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp ole <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ 
ret void ++} ++ ++define void @v4f64_fcmp_ole(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ole: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cle.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ole <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULE ++define void @v8f32_fcmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cule.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp ule <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cule.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ule <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLE ++define void @v8f32_fcmp_le(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_le: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cle.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp fast ole <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> 
%ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_le(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_le: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cle.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp fast ule <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETOLT ++define void @v8f32_fcmp_olt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_olt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp olt <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_olt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_olt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp olt <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULT ++define void @v8f32_fcmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp ult <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ 
store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cult.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ult <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLT ++define void @v8f32_fcmp_lt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_lt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp fast olt <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_lt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_lt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp fast ult <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETONE ++define void @v8f32_fcmp_one(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_one: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cne.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp one <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp 
to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_one(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_one: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cne.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp one <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETUNE ++define void @v8f32_fcmp_une(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_une: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cune.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp une <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_une(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_une: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cune.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp une <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETNE ++define void @v8f32_fcmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cne.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp fast one <8 x float> %v0, %v1 ++ %ext = sext 
<8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cne.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp fast une <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETO ++define void @v8f32_fcmp_ord(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ord: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cor.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp ord <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ord(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ord: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cor.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ord <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETUO ++define void @v8f32_fcmp_uno(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_uno: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cun.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp uno <8 x float> %v0, %v1 ++ 
%ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_uno(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_uno: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvfcmp.cun.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp uno <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETOGT ++define void @v8f32_fcmp_ogt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ogt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp ogt <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ogt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ogt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ogt <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGT ++define void @v8f32_fcmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp 
ugt <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cult.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp ugt <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGT ++define void @v8f32_fcmp_gt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_gt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp fast ogt <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_gt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_gt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp fast ugt <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETOGE ++define void @v8f32_fcmp_oge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_oge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cle.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x 
float>, ptr %a1 ++ %cmp = fcmp oge <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_oge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_oge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cle.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp oge <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGE ++define void @v8f32_fcmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cule.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp uge <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cule.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp uge <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGE ++define void @v8f32_fcmp_ge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8f32_fcmp_ge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cle.s $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr 
%a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %cmp = fcmp fast oge <8 x float> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4f64_fcmp_ge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f64_fcmp_ge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvfcmp.cle.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %cmp = fcmp fast uge <4 x double> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/icmp.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/icmp.ll +new file mode 100644 +index 000000000000..6693fe0f6ec7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/icmp.ll +@@ -0,0 +1,939 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++;; SETEQ ++define void @v32i8_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v32i8_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvseqi.b $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %cmp = icmp eq <32 x i8> %v0, ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v32i8_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp eq <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret 
void ++} ++ ++define void @v16i16_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i16_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvseqi.h $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %cmp = icmp eq <16 x i16> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp eq <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i32_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvseqi.w $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %cmp = icmp eq <8 x i32> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp eq <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i64_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvseqi.d $xr0, $xr0, 15 
++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %cmp = icmp eq <4 x i64> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp eq <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLE ++define void @v32i8_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v32i8_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.b $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %cmp = icmp sle <32 x i8> %v0, ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v32i8_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp sle <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i16_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.h $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %cmp = icmp sle <16 x i16> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr 
%res ++ ret void ++} ++ ++define void @v16i16_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp sle <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i32_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.w $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %cmp = icmp sle <8 x i32> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp sle <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i64_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.d $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %cmp = icmp sle <4 x i64> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld 
$xr1, $a1, 0 ++; CHECK-NEXT: xvsle.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp sle <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULE ++define void @v32i8_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v32i8_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.bu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %cmp = icmp ule <32 x i8> %v0, ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v32i8_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.bu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp ule <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i16_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.hu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %cmp = icmp ule <16 x i16> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.hu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ 
%cmp = icmp ule <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i32_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.wu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %cmp = icmp ule <8 x i32> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.wu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp ule <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i64_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslei.du $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %cmp = icmp ule <4 x i64> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvsle.du $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp ule <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLT ++define void @v32i8_icmp_slt_imm(ptr %res, ptr %a0) 
nounwind { ++; CHECK-LABEL: v32i8_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.b $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %cmp = icmp slt <32 x i8> %v0, ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v32i8_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp slt <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i16_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.h $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %cmp = icmp slt <16 x i16> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp slt <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i32_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.w $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: 
ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %cmp = icmp slt <8 x i32> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp slt <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i64_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.d $xr0, $xr0, 15 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %cmp = icmp slt <4 x i64> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp slt <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULT ++define void @v32i8_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v32i8_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.bu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %cmp = icmp ult <32 x i8> %v0, ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void 
@v32i8_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.bu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp ult <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i16_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.hu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %cmp = icmp ult <16 x i16> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.hu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp ult <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i32_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.wu $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %cmp = icmp ult <8 x i32> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; 
CHECK-NEXT: xvslt.wu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp ult <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i64_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvslti.du $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %cmp = icmp ult <4 x i64> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvslt.du $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp ult <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETNE ++define void @v32i8_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp ne <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvrepli.b $xr1, -1 ++; 
CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp ne <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvrepli.b $xr1, -1 ++; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp ne <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a2, 0 ++; CHECK-NEXT: xvld $xr1, $a1, 0 ++; CHECK-NEXT: xvseq.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvrepli.b $xr1, -1 ++; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp ne <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGE ++define void @v32i8_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp sge <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: 
v16i16_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp sge <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp sge <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp sge <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGE ++define void @v32i8_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.bu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp uge <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_uge: ++; CHECK: # 
%bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.hu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp uge <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.wu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp uge <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvsle.du $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp uge <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGT ++define void @v32i8_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp sgt <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, 
$a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.h $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp sgt <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp sgt <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp sgt <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGT ++define void @v32i8_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v32i8_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.bu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <32 x i8>, ptr %a0 ++ %v1 = load <32 x i8>, ptr %a1 ++ %cmp = icmp ugt <32 x i8> %v0, %v1 ++ %ext = sext <32 x i1> %cmp to <32 x i8> ++ store <32 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i16_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i16_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, 
$a2, 0 ++; CHECK-NEXT: xvslt.hu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i16>, ptr %a0 ++ %v1 = load <16 x i16>, ptr %a1 ++ %cmp = icmp ugt <16 x i16> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i16> ++ store <16 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i32_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i32_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.wu $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %a0 ++ %v1 = load <8 x i32>, ptr %a1 ++ %cmp = icmp ugt <8 x i32> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i32> ++ store <8 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i64_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i64_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvld $xr1, $a2, 0 ++; CHECK-NEXT: xvslt.du $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %a0 ++ %v1 = load <4 x i64>, ptr %a1 ++ %cmp = icmp ugt <4 x i64> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i64> ++ store <4 x i64> %ext, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fcmp.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fcmp.ll +new file mode 100644 +index 000000000000..53fbf0b2f86f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fcmp.ll +@@ -0,0 +1,692 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++;; TREU ++define void @v4f32_fcmp_true(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_true: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vrepli.b $vr0, -1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ 
%cmp = fcmp true <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++;; FALSE ++define void @v2f64_fcmp_false(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_false: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vrepli.b $vr0, 0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp false <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETOEQ ++define void @v4f32_fcmp_oeq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_oeq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.ceq.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp oeq <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_oeq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_oeq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.ceq.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp oeq <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETUEQ ++define void @v4f32_fcmp_ueq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ueq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cueq.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ueq <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> 
%cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ueq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ueq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cueq.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ueq <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETEQ ++define void @v4f32_fcmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.ceq.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp fast oeq <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.ceq.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp fast ueq <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETOLE ++define void @v4f32_fcmp_ole(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ole: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cle.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ole <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> 
%cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ole(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ole: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cle.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ole <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULE ++define void @v4f32_fcmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cule.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ule <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cule.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ule <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLE ++define void @v4f32_fcmp_le(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_le: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cle.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp fast ole <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> 
%cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_le(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_le: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cle.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp fast ule <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETOLT ++define void @v4f32_fcmp_olt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_olt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.clt.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp olt <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_olt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_olt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.clt.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp olt <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULT ++define void @v4f32_fcmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cult.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ult <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> 
%cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cult.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ult <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLT ++define void @v4f32_fcmp_lt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_lt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.clt.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp fast olt <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_lt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_lt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.clt.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp fast ult <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETONE ++define void @v4f32_fcmp_one(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_one: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cne.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp one <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> 
%cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_one(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_one: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cne.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp one <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETUNE ++define void @v4f32_fcmp_une(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_une: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cune.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp une <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_une(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_une: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cune.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp une <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETNE ++define void @v4f32_fcmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cne.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp fast one <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> 
%cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cne.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp fast une <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETO ++define void @v4f32_fcmp_ord(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ord: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cor.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ord <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ord(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ord: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cor.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ord <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETUO ++define void @v4f32_fcmp_uno(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_uno: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cun.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp uno <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp 
to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_uno(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_uno: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vfcmp.cun.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp uno <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETOGT ++define void @v4f32_fcmp_ogt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ogt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.clt.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ogt <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ogt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ogt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.clt.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ogt <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGT ++define void @v4f32_fcmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cult.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp ugt <4 x float> %v0, %v1 ++ %ext = sext <4 x 
i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cult.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp ugt <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGT ++define void @v4f32_fcmp_gt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_gt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.clt.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp fast ogt <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_gt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_gt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.clt.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp fast ugt <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETOGE ++define void @v4f32_fcmp_oge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_oge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cle.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp oge <4 x float> %v0, %v1 ++ 
%ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_oge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_oge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cle.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp oge <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGE ++define void @v4f32_fcmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cule.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp uge <4 x float> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cule.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp uge <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGE ++define void @v4f32_fcmp_ge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4f32_fcmp_ge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cle.s $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %cmp = fcmp fast oge <4 x float> 
%v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2f64_fcmp_ge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2f64_fcmp_ge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vfcmp.cle.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %cmp = fcmp fast uge <2 x double> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/icmp.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/icmp.ll +new file mode 100644 +index 000000000000..448f3fa6c6e0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/icmp.ll +@@ -0,0 +1,939 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++;; SETEQ ++define void @v16i8_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i8_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vseqi.b $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %cmp = icmp eq <16 x i8> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i8_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp eq <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; 
CHECK-LABEL: v8i16_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vseqi.h $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %cmp = icmp eq <8 x i16> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp eq <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i32_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vseqi.w $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %cmp = icmp eq <4 x i32> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp eq <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_eq_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v2i64_icmp_eq_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vseqi.d $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %cmp = icmp eq <2 x 
i64> %v0, ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_eq(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_eq: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp eq <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLE ++define void @v16i8_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i8_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.b $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %cmp = icmp sle <16 x i8> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i8_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp sle <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i16_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.h $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %cmp = icmp sle <8 x i16> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_sle: ++; 
CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp sle <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i32_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.w $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %cmp = icmp sle <4 x i32> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp sle <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_sle_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v2i64_icmp_sle_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.d $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %cmp = icmp sle <2 x i64> %v0, ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_sle(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_sle: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load 
<2 x i64>, ptr %a1 ++ %cmp = icmp sle <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULE ++define void @v16i8_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i8_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.bu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %cmp = icmp ule <16 x i8> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i8_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.bu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp ule <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i16_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.hu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %cmp = icmp ule <8 x i16> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.hu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp ule <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_ule_imm(ptr %res, ptr %a0) 
nounwind { ++; CHECK-LABEL: v4i32_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.wu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %cmp = icmp ule <4 x i32> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.wu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp ule <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_ule_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v2i64_icmp_ule_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslei.du $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %cmp = icmp ule <2 x i64> %v0, ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_ule(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_ule: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vsle.du $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp ule <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETLT ++define void @v16i8_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i8_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.b $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load 
<16 x i8>, ptr %a0 ++ %cmp = icmp slt <16 x i8> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i8_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp slt <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i16_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.h $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %cmp = icmp slt <8 x i16> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp slt <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i32_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.w $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %cmp = icmp slt <4 x i32> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; 
CHECK-LABEL: v4i32_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp slt <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_slt_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v2i64_icmp_slt_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.d $vr0, $vr0, 15 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %cmp = icmp slt <2 x i64> %v0, ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_slt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_slt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp slt <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; SETULT ++define void @v16i8_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v16i8_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.bu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %cmp = icmp ult <16 x i8> %v0, ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v16i8_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.bu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret 
++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp ult <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v8i16_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.hu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %cmp = icmp ult <8 x i16> %v0, ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.hu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp ult <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v4i32_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.wu $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %cmp = icmp ult <4 x i32> %v0, ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.wu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp ult <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void 
@v2i64_icmp_ult_imm(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: v2i64_icmp_ult_imm: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vslti.du $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %cmp = icmp ult <2 x i64> %v0, ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_ult(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_ult: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vslt.du $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp ult <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETNE ++define void @v16i8_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vxori.b $vr0, $vr0, 255 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp ne <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vrepli.b $vr1, -1 ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp ne <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_ne(ptr 
%res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vrepli.b $vr1, -1 ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp ne <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_ne(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_ne: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a2, 0 ++; CHECK-NEXT: vld $vr1, $a1, 0 ++; CHECK-NEXT: vseq.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vrepli.b $vr1, -1 ++; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp ne <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGE ++define void @v16i8_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp sge <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp sge <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ 
store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp sge <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_sge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_sge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp sge <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGE ++define void @v16i8_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.bu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp uge <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.hu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp uge <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ 
++define void @v4i32_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.wu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp uge <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_uge(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_uge: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vsle.du $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp uge <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETGT ++define void @v16i8_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.b $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp sgt <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.h $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp sgt <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_sgt(ptr %res, ptr %a0, ptr %a1) 
nounwind { ++; CHECK-LABEL: v4i32_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.w $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp sgt <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_sgt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_sgt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.d $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp sgt <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} ++ ++;; Expand SETUGT ++define void @v16i8_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v16i8_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.bu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <16 x i8>, ptr %a0 ++ %v1 = load <16 x i8>, ptr %a1 ++ %cmp = icmp ugt <16 x i8> %v0, %v1 ++ %ext = sext <16 x i1> %cmp to <16 x i8> ++ store <16 x i8> %ext, ptr %res ++ ret void ++} ++ ++define void @v8i16_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v8i16_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.hu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i16>, ptr %a0 ++ %v1 = load <8 x i16>, ptr %a1 ++ %cmp = icmp ugt <8 x i16> %v0, %v1 ++ %ext = sext <8 x i1> %cmp to <8 x i16> ++ store <8 x i16> %ext, ptr %res ++ ret void ++} ++ ++define void @v4i32_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v4i32_icmp_ugt: ++; CHECK: # 
%bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.wu $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %a0 ++ %v1 = load <4 x i32>, ptr %a1 ++ %cmp = icmp ugt <4 x i32> %v0, %v1 ++ %ext = sext <4 x i1> %cmp to <4 x i32> ++ store <4 x i32> %ext, ptr %res ++ ret void ++} ++ ++define void @v2i64_icmp_ugt(ptr %res, ptr %a0, ptr %a1) nounwind { ++; CHECK-LABEL: v2i64_icmp_ugt: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vld $vr1, $a2, 0 ++; CHECK-NEXT: vslt.du $vr0, $vr1, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %a0 ++ %v1 = load <2 x i64>, ptr %a1 ++ %cmp = icmp ugt <2 x i64> %v0, %v1 ++ %ext = sext <2 x i1> %cmp to <2 x i64> ++ store <2 x i64> %ext, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 49444f4fbca6681e0fd404a19b562ccfcc140879 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Fri, 8 Dec 2023 14:16:26 +0800 +Subject: [PATCH 24/35] [LoongArch] Make ISD::FSQRT a legal operation with + lsx/lasx feature (#74795) + +And add some patterns: +1. (fdiv 1.0, vector) +2. 
(fdiv 1.0, (fsqrt vector)) + +(cherry picked from commit 9f70e708a7d3fce97d63b626520351501455fca0) +--- + .../LoongArch/LoongArchISelLowering.cpp | 2 + + .../LoongArch/LoongArchLASXInstrInfo.td | 22 +++++++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 45 +++++++++++++ + llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll | 65 +++++++++++++++++++ + .../LoongArch/lasx/ir-instruction/fdiv.ll | 29 +++++++++ + llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll | 65 +++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/fdiv.ll | 29 +++++++++ + 7 files changed, 257 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 3d5ae6d3deda..8c54c7cf2cab 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -260,6 +260,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); ++ setOperationAction(ISD::FSQRT, VT, Legal); + setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, + ISD::SETUGE, ISD::SETUGT}, + VT, Expand); +@@ -300,6 +301,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); ++ setOperationAction(ISD::FSQRT, VT, Legal); + setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, + ISD::SETUGE, ISD::SETUGT}, + VT, Expand); +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index a9bf65c6840d..55b90f4450c0 100644 +--- 
a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1092,6 +1092,13 @@ multiclass PatXr { + (!cast(Inst#"_D") LASX256:$xj)>; + } + ++multiclass PatXrF { ++ def : Pat<(v8f32 (OpNode (v8f32 LASX256:$xj))), ++ (!cast(Inst#"_S") LASX256:$xj)>; ++ def : Pat<(v4f64 (OpNode (v4f64 LASX256:$xj))), ++ (!cast(Inst#"_D") LASX256:$xj)>; ++} ++ + multiclass PatXrXr { + def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), + (!cast(Inst#"_B") LASX256:$xj, LASX256:$xk)>; +@@ -1448,6 +1455,21 @@ def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), + def : Pat<(fma v4f64:$xj, v4f64:$xk, v4f64:$xa), + (XVFMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; + ++// XVFSQRT_{S/D} ++defm : PatXrF; ++ ++// XVRECIP_{S/D} ++def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj), ++ (XVFRECIP_S v8f32:$xj)>; ++def : Pat<(fdiv vsplatf64_fpimm_eq_1, v4f64:$xj), ++ (XVFRECIP_D v4f64:$xj)>; ++ ++// XVFRSQRT_{S/D} ++def : Pat<(fdiv vsplatf32_fpimm_eq_1, (fsqrt v8f32:$xj)), ++ (XVFRSQRT_S v8f32:$xj)>; ++def : Pat<(fdiv vsplatf64_fpimm_eq_1, (fsqrt v4f64:$xj)), ++ (XVFRSQRT_D v4f64:$xj)>; ++ + // XVSEQ[I]_{B/H/W/D} + defm : PatCCXrSimm5; + defm : PatCCXrXr; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index ff21c6681271..8ad0c5904f25 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -95,6 +95,29 @@ def vsplati64_imm_eq_63 : PatFrags<(ops), [(build_vector), + Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 63; + }]>; + ++def vsplatf32_fpimm_eq_1 ++ : PatFrags<(ops), [(bitconvert (v4i32 (build_vector))), ++ (bitconvert (v8i32 (build_vector)))], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && ++ Imm == 
APFloat(+1.0f).bitcastToAPInt(); ++}]>; ++def vsplatf64_fpimm_eq_1 ++ : PatFrags<(ops), [(bitconvert (v2i64 (build_vector))), ++ (bitconvert (v4i64 (build_vector)))], [{ ++ APInt Imm; ++ EVT EltTy = N->getValueType(0).getVectorElementType(); ++ N = N->getOperand(0).getNode(); ++ ++ return selectVSplat(N, Imm, EltTy.getSizeInBits()) && ++ Imm.getBitWidth() == EltTy.getSizeInBits() && ++ Imm == APFloat(+1.0).bitcastToAPInt(); ++}]>; ++ + def vsplati8imm7 : PatFrag<(ops node:$reg), + (and node:$reg, vsplati8_imm_eq_7)>; + def vsplati16imm15 : PatFrag<(ops node:$reg), +@@ -1173,6 +1196,13 @@ multiclass PatVr { + (!cast(Inst#"_D") LSX128:$vj)>; + } + ++multiclass PatVrF { ++ def : Pat<(v4f32 (OpNode (v4f32 LSX128:$vj))), ++ (!cast(Inst#"_S") LSX128:$vj)>; ++ def : Pat<(v2f64 (OpNode (v2f64 LSX128:$vj))), ++ (!cast(Inst#"_D") LSX128:$vj)>; ++} ++ + multiclass PatVrVr { + def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), + (!cast(Inst#"_B") LSX128:$vj, LSX128:$vk)>; +@@ -1525,6 +1555,21 @@ def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), + def : Pat<(fma v2f64:$vj, v2f64:$vk, v2f64:$va), + (VFMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; + ++// VFSQRT_{S/D} ++defm : PatVrF; ++ ++// VFRECIP_{S/D} ++def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj), ++ (VFRECIP_S v4f32:$vj)>; ++def : Pat<(fdiv vsplatf64_fpimm_eq_1, v2f64:$vj), ++ (VFRECIP_D v2f64:$vj)>; ++ ++// VFRSQRT_{S/D} ++def : Pat<(fdiv vsplatf32_fpimm_eq_1, (fsqrt v4f32:$vj)), ++ (VFRSQRT_S v4f32:$vj)>; ++def : Pat<(fdiv vsplatf64_fpimm_eq_1, (fsqrt v2f64:$vj)), ++ (VFRSQRT_D v2f64:$vj)>; ++ + // VSEQ[I]_{B/H/W/D} + defm : PatCCVrSimm5; + defm : PatCCVrVr; +diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll +new file mode 100644 +index 000000000000..c4a881bdeae9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll +@@ -0,0 +1,65 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 
--mattr=+lasx < %s | FileCheck %s ++ ++;; fsqrt ++define void @sqrt_v8f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sqrt_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvfsqrt.s $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0, align 16 ++ %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0) ++ store <8 x float> %sqrt, ptr %res, align 16 ++ ret void ++} ++ ++define void @sqrt_v4f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sqrt_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvfsqrt.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0, align 16 ++ %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0) ++ store <4 x double> %sqrt, ptr %res, align 16 ++ ret void ++} ++ ++;; 1.0 / (fsqrt vec) ++define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_div_sqrt_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvfrsqrt.s $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0, align 16 ++ %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0) ++ %div = fdiv <8 x float> , %sqrt ++ store <8 x float> %div, ptr %res, align 16 ++ ret void ++} ++ ++define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_div_sqrt_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvfrsqrt.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0, align 16 ++ %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0) ++ %div = fdiv <4 x double> , %sqrt ++ store <4 x double> %div, ptr %res, align 16 ++ ret void ++} ++ ++declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) ++declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) +diff --git 
a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll +index 284121a79a49..6004565b0b78 100644 +--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll +@@ -32,3 +32,32 @@ entry: + store <4 x double> %v2, ptr %res + ret void + } ++ ++;; 1.0 / vec ++define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_fdiv_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvfrecip.s $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %div = fdiv <8 x float> , %v0 ++ store <8 x float> %div, ptr %res ++ ret void ++} ++ ++define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_fdiv_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvfrecip.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %div = fdiv <4 x double> , %v0 ++ store <4 x double> %div, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll +new file mode 100644 +index 000000000000..a57bc1ca0e94 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll +@@ -0,0 +1,65 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++;; fsqrt ++define void @sqrt_v4f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: sqrt_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vfsqrt.s $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0, align 16 ++ %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0) ++ store <4 x float> %sqrt, ptr %res, align 16 ++ ret void ++} ++ ++define void @sqrt_v2f64(ptr %res, ptr %a0) 
nounwind { ++; CHECK-LABEL: sqrt_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vfsqrt.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0, align 16 ++ %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0) ++ store <2 x double> %sqrt, ptr %res, align 16 ++ ret void ++} ++ ++;; 1.0 / (fsqrt vec) ++define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_div_sqrt_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vfrsqrt.s $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0, align 16 ++ %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0) ++ %div = fdiv <4 x float> , %sqrt ++ store <4 x float> %div, ptr %res, align 16 ++ ret void ++} ++ ++define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_div_sqrt_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vfrsqrt.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0, align 16 ++ %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0) ++ %div = fdiv <2 x double> , %sqrt ++ store <2 x double> %div, ptr %res, align 16 ++ ret void ++} ++ ++declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) ++declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll +index eb7c8bd9616e..5f1ee9e4d212 100644 +--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll +@@ -32,3 +32,32 @@ entry: + store <2 x double> %v2, ptr %res + ret void + } ++ ++;; 1.0 / vec ++define void @one_fdiv_v4f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_fdiv_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; 
CHECK-NEXT: vfrecip.s $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %div = fdiv <4 x float> , %v0 ++ store <4 x float> %div, ptr %res ++ ret void ++} ++ ++define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: one_fdiv_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vfrecip.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %div = fdiv <2 x double> , %v0 ++ store <2 x double> %div, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 5942b745b9680284decadd33d2242ffd3d2d61c0 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Fri, 8 Dec 2023 14:21:10 +0800 +Subject: [PATCH 25/35] [LoongArch] Mark ISD::FNEG as legal + +(cherry picked from commit cdc37325669c0321328a7245083c427b229e79e9) +--- + .../LoongArch/LoongArchISelLowering.cpp | 2 ++ + .../LoongArch/LoongArchLASXInstrInfo.td | 4 +++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 4 +++ + .../LoongArch/lasx/ir-instruction/fneg.ll | 29 +++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/fneg.ll | 29 +++++++++++++++++++ + 5 files changed, 68 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 8c54c7cf2cab..c7f4b1d24f07 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -261,6 +261,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::FSQRT, VT, Legal); ++ setOperationAction(ISD::FNEG, VT, Legal); + setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, + 
ISD::SETUGE, ISD::SETUGT}, + VT, Expand); +@@ -302,6 +303,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); + setOperationAction(ISD::FMA, VT, Legal); + setOperationAction(ISD::FSQRT, VT, Legal); ++ setOperationAction(ISD::FNEG, VT, Legal); + setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, + ISD::SETUGE, ISD::SETUGT}, + VT, Expand); +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 55b90f4450c0..8559baa0e525 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1605,6 +1605,10 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in + def : Pat<(vt (vselect LASX256:$xa, LASX256:$xk, LASX256:$xj)), + (XVBITSEL_V LASX256:$xj, LASX256:$xk, LASX256:$xa)>; + ++// fneg ++def : Pat<(fneg (v8f32 LASX256:$xj)), (XVBITREVI_W LASX256:$xj, 31)>; ++def : Pat<(fneg (v4f64 LASX256:$xj)), (XVBITREVI_D LASX256:$xj, 63)>; ++ + } // Predicates = [HasExtLASX] + + /// Intrinsic pattern +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 8ad0c5904f25..5947f241bb59 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1712,6 +1712,10 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in + def : Pat<(vt (vselect LSX128:$va, LSX128:$vk, LSX128:$vj)), + (VBITSEL_V LSX128:$vj, LSX128:$vk, LSX128:$va)>; + ++// fneg ++def : Pat<(fneg (v4f32 LSX128:$vj)), (VBITREVI_W LSX128:$vj, 31)>; ++def : Pat<(fneg (v2f64 LSX128:$vj)), (VBITREVI_D LSX128:$vj, 63)>; ++ + } // Predicates = [HasExtLSX] + + /// Intrinsic pattern +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll +new file mode 100644 +index 
000000000000..5eb468fc55a0 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll +@@ -0,0 +1,29 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fneg_v8f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: fneg_v8f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvbitrevi.w $xr0, $xr0, 31 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = fneg <8 x float> %v0 ++ store <8 x float> %v1, ptr %res ++ ret void ++} ++define void @fneg_v4f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: fneg_v4f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvbitrevi.d $xr0, $xr0, 63 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = fneg <4 x double> %v0 ++ store <4 x double> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll +new file mode 100644 +index 000000000000..795c1ac8b368 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll +@@ -0,0 +1,29 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @fneg_v4f32(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: fneg_v4f32: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vbitrevi.w $vr0, $vr0, 31 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = fneg <4 x float> %v0 ++ store <4 x float> %v1, ptr %res ++ ret void ++} ++define void @fneg_v2f64(ptr %res, ptr %a0) nounwind { ++; CHECK-LABEL: fneg_v2f64: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vbitrevi.d $vr0, $vr0, 
63 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = fneg <2 x double> %v0 ++ store <2 x double> %v1, ptr %res ++ ret void ++} +-- +2.20.1 + + +From b8eb506d34e303ddc42bc4e8f304a81ba320dff2 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Mon, 11 Dec 2023 10:37:22 +0800 +Subject: [PATCH 26/35] [LoongArch] Add codegen support for + [X]VF{MSUB/NMADD/NMSUB}.{S/D} instructions (#74819) + +This is similar to single and double-precision floating-point +instructions. + +(cherry picked from commit af999c4be9f5643724c6f379690ecee4346b2b48) +--- + .../LoongArch/LoongArchLASXInstrInfo.td | 26 + + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 26 + + llvm/test/CodeGen/LoongArch/lasx/fma-v4f64.ll | 804 ++++++++++++++++++ + llvm/test/CodeGen/LoongArch/lasx/fma-v8f32.ll | 804 ++++++++++++++++++ + llvm/test/CodeGen/LoongArch/lsx/fma-v2f64.ll | 804 ++++++++++++++++++ + llvm/test/CodeGen/LoongArch/lsx/fma-v4f32.ll | 804 ++++++++++++++++++ + 6 files changed, 3268 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fma-v4f64.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fma-v8f32.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fma-v2f64.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fma-v4f32.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 8559baa0e525..ec6983d0f487 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1455,6 +1455,32 @@ def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), + def : Pat<(fma v4f64:$xj, v4f64:$xk, v4f64:$xa), + (XVFMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; + ++// XVFMSUB_{S/D} ++def : Pat<(fma v8f32:$xj, v8f32:$xk, (fneg v8f32:$xa)), ++ (XVFMSUB_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; ++def : Pat<(fma v4f64:$xj, v4f64:$xk, (fneg v4f64:$xa)), ++ (XVFMSUB_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; ++ ++// 
XVFNMADD_{S/D} ++def : Pat<(fneg (fma v8f32:$xj, v8f32:$xk, v8f32:$xa)), ++ (XVFNMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; ++def : Pat<(fneg (fma v4f64:$xj, v4f64:$xk, v4f64:$xa)), ++ (XVFNMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; ++def : Pat<(fma_nsz (fneg v8f32:$xj), v8f32:$xk, (fneg v8f32:$xa)), ++ (XVFNMADD_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; ++def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, (fneg v4f64:$xa)), ++ (XVFNMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; ++ ++// XVFNMSUB_{S/D} ++def : Pat<(fneg (fma v8f32:$xj, v8f32:$xk, (fneg v8f32:$xa))), ++ (XVFNMSUB_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; ++def : Pat<(fneg (fma v4f64:$xj, v4f64:$xk, (fneg v4f64:$xa))), ++ (XVFNMSUB_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; ++def : Pat<(fma_nsz (fneg v8f32:$xj), v8f32:$xk, v8f32:$xa), ++ (XVFNMSUB_S v8f32:$xj, v8f32:$xk, v8f32:$xa)>; ++def : Pat<(fma_nsz (fneg v4f64:$xj), v4f64:$xk, v4f64:$xa), ++ (XVFNMSUB_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; ++ + // XVFSQRT_{S/D} + defm : PatXrF; + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 5947f241bb59..e468176885d7 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1555,6 +1555,32 @@ def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), + def : Pat<(fma v2f64:$vj, v2f64:$vk, v2f64:$va), + (VFMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; + ++// VFMSUB_{S/D} ++def : Pat<(fma v4f32:$vj, v4f32:$vk, (fneg v4f32:$va)), ++ (VFMSUB_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; ++def : Pat<(fma v2f64:$vj, v2f64:$vk, (fneg v2f64:$va)), ++ (VFMSUB_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; ++ ++// VFNMADD_{S/D} ++def : Pat<(fneg (fma v4f32:$vj, v4f32:$vk, v4f32:$va)), ++ (VFNMADD_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; ++def : Pat<(fneg (fma v2f64:$vj, v2f64:$vk, v2f64:$va)), ++ (VFNMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; ++def : Pat<(fma_nsz (fneg v4f32:$vj), v4f32:$vk, (fneg v4f32:$va)), ++ (VFNMADD_S v4f32:$vj, 
v4f32:$vk, v4f32:$va)>; ++def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, (fneg v2f64:$va)), ++ (VFNMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; ++ ++// VFNMSUB_{S/D} ++def : Pat<(fneg (fma v4f32:$vj, v4f32:$vk, (fneg v4f32:$va))), ++ (VFNMSUB_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; ++def : Pat<(fneg (fma v2f64:$vj, v2f64:$vk, (fneg v2f64:$va))), ++ (VFNMSUB_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; ++def : Pat<(fma_nsz (fneg v4f32:$vj), v4f32:$vk, v4f32:$va), ++ (VFNMSUB_S v4f32:$vj, v4f32:$vk, v4f32:$va)>; ++def : Pat<(fma_nsz (fneg v2f64:$vj), v2f64:$vk, v2f64:$va), ++ (VFNMSUB_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; ++ + // VFSQRT_{S/D} + defm : PatVrF; + +diff --git a/llvm/test/CodeGen/LoongArch/lasx/fma-v4f64.ll b/llvm/test/CodeGen/LoongArch/lasx/fma-v4f64.ll +new file mode 100644 +index 000000000000..af18c52b096c +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/fma-v4f64.ll +@@ -0,0 +1,804 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx --fp-contract=fast < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-FAST ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx --fp-contract=on < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-ON ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx --fp-contract=off < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-OFF ++ ++define void @xvfmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, 
$a3, 0 ++; CONTRACT-ON-NEXT: xvfadd.d $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfadd.d $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul<4 x double> %v0, %v1 ++ %add = fadd<4 x double> %mul, %v2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @xvfmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul<4 x double> %v0, 
%v1 ++ %sub = fsub<4 x double> %mul, %v2 ++ store <4 x double> %sub, ptr %res ++ ret void ++} ++ ++define void @xvfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfadd.d $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvbitrevi.d $xr0, $xr0, 63 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfadd.d $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvbitrevi.d $xr0, $xr0, 63 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul<4 x double> %v0, %v1 ++ %add = fadd<4 x double> %mul, %v2 ++ %negadd = fneg<4 x double> %add ++ store <4 x double> %negadd, ptr %res ++ ret void ++} ++ ++define void @xvfnmadd_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmadd_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; 
CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmadd_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.d $xr1, $xr1, 63 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmadd_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.d $xr1, $xr1, 63 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg nsz<4 x double> %v0 ++ %negv2 = fneg nsz<4 x double> %v2 ++ %mul = fmul nsz<4 x double> %negv0, %v1 ++ %add = fadd nsz<4 x double> %mul, %negv2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that xvfnmadd.d is not emitted. 
++define void @not_xvfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_xvfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-FAST-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_xvfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.d $xr1, $xr1, 63 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_xvfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.d $xr1, $xr1, 63 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg<4 x double> %v0 ++ %negv2 = fneg<4 x double> %v2 ++ %mul = fmul<4 x double> %negv0, %v1 ++ %add = fadd<4 x double> %mul, %negv2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @xvfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; 
CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvbitrevi.d $xr0, $xr0, 63 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.d $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvbitrevi.d $xr0, $xr0, 63 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv2 = fneg<4 x double> %v2 ++ %mul = fmul<4 x double> %v0, %v1 ++ %add = fadd<4 x double> %mul, %negv2 ++ %neg = fneg<4 x double> %add ++ store <4 x double> %neg, ptr %res ++ ret void ++} ++ ++define void @xvfnmsub_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmsub_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmsub_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmsub_d_nsz: ++; CONTRACT-OFF: # %bb.0: 
# %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg nsz<4 x double> %v0 ++ %mul = fmul nsz<4 x double> %negv0, %v1 ++ %add = fadd nsz<4 x double> %mul, %v2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that xvfnmsub.d is not emitted. ++define void @not_xvfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_xvfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-FAST-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_xvfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.d $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_xvfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.d $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg<4 x double> %v0 ++ %mul = fmul<4 x 
double> %negv0, %v1 ++ %add = fadd<4 x double> %mul, %v2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @contract_xvfmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %add = fadd contract <4 x double> %mul, %v2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @contract_xvfmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; 
CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %sub = fsub contract <4 x double> %mul, %v2 ++ store <4 x double> %sub, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = 
fmul contract <4 x double> %v0, %v1 ++ %add = fadd contract <4 x double> %mul, %v2 ++ %negadd = fneg contract <4 x double> %add ++ store <4 x double> %negadd, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmadd_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmadd_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmadd_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmadd_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg contract nsz<4 x double> %v0 ++ %negv2 = fneg contract nsz<4 x double> %v2 ++ %mul = fmul contract nsz<4 x double> %negv0, %v1 ++ %add = fadd contract nsz<4 x double> %mul, %negv2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that xvfnmadd.d is not emitted. 
++define void @not_contract_xvfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_xvfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-FAST-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_xvfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-ON-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_xvfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-OFF-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg contract <4 x double> %v0 ++ %negv2 = fneg contract <4 x double> %v2 ++ %mul = fmul contract <4 x double> %negv0, %v1 ++ %add = fadd contract <4 x double> %mul, %negv2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; 
CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv2 = fneg contract <4 x double> %v2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %add = fadd contract <4 x double> %mul, %negv2 ++ %neg = fneg contract <4 x double> %add ++ store <4 x double> %neg, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmsub_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmsub_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmsub_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmsub_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, 
$a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg contract nsz<4 x double> %v0 ++ %mul = fmul contract nsz<4 x double> %negv0, %v1 ++ %add = fadd contract nsz<4 x double> %mul, %v2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that xvfnmsub.d is not emitted. ++define void @not_contract_xvfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_xvfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-FAST-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_xvfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-ON-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_xvfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.d $xr2, $xr2, 63 ++; CONTRACT-OFF-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %negv0 = fneg contract <4 x double> %v0 ++ %mul = fmul contract <4 x double> %negv0, %v1 ++ %add = fadd contract <4 x double> %mul, %v2 ++ 
store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @xvfmadd_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmadd_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmadd_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmadd_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %add = fadd contract <4 x double> %mul, %v2 ++ store <4 x double> %add, ptr %res ++ ret void ++} ++ ++define void @xvfmsub_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmsub_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmsub_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld 
$xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmsub_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %sub = fsub contract <4 x double> %mul, %v2 ++ store <4 x double> %sub, ptr %res ++ ret void ++} ++ ++define void @xvfnmadd_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmadd_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmadd_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmadd_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmadd.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %add = fadd contract <4 
x double> %mul, %v2 ++ %negadd = fneg contract <4 x double> %add ++ store <4 x double> %negadd, ptr %res ++ ret void ++} ++ ++define void @xvfnmsub_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmsub_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmsub_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmsub_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmsub.d $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x double>, ptr %a0 ++ %v1 = load <4 x double>, ptr %a1 ++ %v2 = load <4 x double>, ptr %a2 ++ %mul = fmul contract <4 x double> %v0, %v1 ++ %negv2 = fneg contract <4 x double> %v2 ++ %add = fadd contract <4 x double> %negv2, %mul ++ %negadd = fneg contract <4 x double> %add ++ store <4 x double> %negadd, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/fma-v8f32.ll b/llvm/test/CodeGen/LoongArch/lasx/fma-v8f32.ll +new file mode 100644 +index 000000000000..b7b3cb3a2e66 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/fma-v8f32.ll +@@ -0,0 +1,804 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx --fp-contract=fast < %s \ ++; RUN: | FileCheck %s 
--check-prefix=CONTRACT-FAST ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx --fp-contract=on < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-ON ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx --fp-contract=off < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-OFF ++ ++define void @xvfmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfadd.s $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfadd.s $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul<8 x float> %v0, %v1 ++ %add = fadd<8 x float> %mul, %v2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++define void @xvfmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst 
$xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul<8 x float> %v0, %v1 ++ %sub = fsub<8 x float> %mul, %v2 ++ store <8 x float> %sub, ptr %res ++ ret void ++} ++ ++define void @xvfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfadd.s $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvbitrevi.w $xr0, $xr0, 31 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.s 
$xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfadd.s $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvbitrevi.w $xr0, $xr0, 31 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul<8 x float> %v0, %v1 ++ %add = fadd<8 x float> %mul, %v2 ++ %negadd = fneg<8 x float> %add ++ store <8 x float> %negadd, ptr %res ++ ret void ++} ++ ++define void @xvfnmadd_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmadd_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmadd_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.w $xr1, $xr1, 31 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmadd_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.w $xr1, $xr1, 31 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg nsz<8 x float> %v0 ++ %negv2 = fneg nsz<8 x float> %v2 ++ %mul = fmul nsz<8 x float> %negv0, %v1 ++ %add = fadd nsz<8 x float> 
%mul, %negv2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that fnmadd.s is not emitted. ++define void @not_xvfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_xvfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-FAST-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_xvfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.w $xr1, $xr1, 31 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_xvfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.w $xr1, $xr1, 31 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg<8 x float> %v0 ++ %negv2 = fneg<8 x float> %v2 ++ %mul = fmul<8 x float> %negv0, %v1 ++ %add = fadd<8 x float> %mul, %negv2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++define void @xvfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; 
CONTRACT-FAST-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-ON-NEXT: xvbitrevi.w $xr0, $xr0, 31 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.s $xr0, $xr0, $xr1 ++; CONTRACT-OFF-NEXT: xvbitrevi.w $xr0, $xr0, 31 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv2 = fneg<8 x float> %v2 ++ %mul = fmul<8 x float> %v0, %v1 ++ %add = fadd<8 x float> %mul, %negv2 ++ %neg = fneg<8 x float> %add ++ store <8 x float> %neg, ptr %res ++ ret void ++} ++ ++define void @xvfnmsub_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmsub_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmsub_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; 
CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmsub_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg nsz<8 x float> %v0 ++ %mul = fmul nsz<8 x float> %negv0, %v1 ++ %add = fadd nsz<8 x float> %mul, %v2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that fnmsub.s is not emitted. ++define void @not_xvfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_xvfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-FAST-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_xvfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-ON-NEXT: xvfsub.s $xr0, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_xvfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmul.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a3, 0 ++; CONTRACT-OFF-NEXT: xvfsub.s $xr0, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = 
load <8 x float>, ptr %a2 ++ %negv0 = fneg<8 x float> %v0 ++ %mul = fmul<8 x float> %negv0, %v1 ++ %add = fadd<8 x float> %mul, %v2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++define void @contract_xvfmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %add = fadd contract <8 x float> %mul, %v2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++define void @contract_xvfmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfmsub_s: ++; CONTRACT-ON: 
# %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %sub = fsub contract <8 x float> %mul, %v2 ++ store <8 x float> %sub, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, 
ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %add = fadd contract <8 x float> %mul, %v2 ++ %negadd = fneg contract <8 x float> %add ++ store <8 x float> %negadd, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmadd_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmadd_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmadd_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmadd_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg contract nsz<8 x float> %v0 ++ %negv2 = fneg contract nsz<8 x float> %v2 ++ %mul = fmul contract nsz<8 x float> %negv0, %v1 ++ %add = fadd contract nsz<8 x float> %mul, %negv2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that fnmadd.s is not emitted. 
++define void @not_contract_xvfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_xvfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-FAST-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_xvfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-ON-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_xvfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-OFF-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg contract <8 x float> %v0 ++ %negv2 = fneg contract <8 x float> %v2 ++ %mul = fmul contract <8 x float> %negv0, %v1 ++ %add = fadd contract <8 x float> %mul, %negv2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; 
CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv2 = fneg contract <8 x float> %v2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %add = fadd contract <8 x float> %mul, %negv2 ++ %neg = fneg contract <8 x float> %add ++ store <8 x float> %neg, ptr %res ++ ret void ++} ++ ++define void @contract_xvfnmsub_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_xvfnmsub_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_xvfnmsub_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_xvfnmsub_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 
++; CONTRACT-OFF-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg contract nsz<8 x float> %v0 ++ %mul = fmul contract nsz<8 x float> %negv0, %v1 ++ %add = fadd contract nsz<8 x float> %mul, %v2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that fnmsub.s is not emitted. ++define void @not_contract_xvfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_xvfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-FAST-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_xvfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-ON-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_xvfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvbitrevi.w $xr2, $xr2, 31 ++; CONTRACT-OFF-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %negv0 = fneg contract <8 x float> %v0 ++ %mul = fmul contract <8 x float> %negv0, %v1 ++ %add = fadd contract <8 x float> %mul, %v2 ++ store <8 x float> %add, 
ptr %res ++ ret void ++} ++ ++define void @xvfmadd_s_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmadd_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmadd_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmadd_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %add = fadd contract <8 x float> %mul, %v2 ++ store <8 x float> %add, ptr %res ++ ret void ++} ++ ++define void @xvfmsub_s_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfmsub_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfmsub_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: 
xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfmsub_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %sub = fsub contract <8 x float> %mul, %v2 ++ store <8 x float> %sub, ptr %res ++ ret void ++} ++ ++define void @xvfnmadd_s_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmadd_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmadd_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmadd_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmadd.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %add = fadd contract <8 x float> %mul, %v2 ++ %negadd = fneg contract 
<8 x float> %add ++ store <8 x float> %negadd, ptr %res ++ ret void ++} ++ ++define void @xvfnmsub_s_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: xvfnmsub_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-FAST-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-FAST-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-FAST-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-FAST-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: xvfnmsub_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-ON-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-ON-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-ON-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-ON-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: xvfnmsub_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: xvld $xr0, $a3, 0 ++; CONTRACT-OFF-NEXT: xvld $xr1, $a2, 0 ++; CONTRACT-OFF-NEXT: xvld $xr2, $a1, 0 ++; CONTRACT-OFF-NEXT: xvfnmsub.s $xr0, $xr2, $xr1, $xr0 ++; CONTRACT-OFF-NEXT: xvst $xr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <8 x float>, ptr %a0 ++ %v1 = load <8 x float>, ptr %a1 ++ %v2 = load <8 x float>, ptr %a2 ++ %mul = fmul contract <8 x float> %v0, %v1 ++ %negv2 = fneg contract <8 x float> %v2 ++ %add = fadd contract <8 x float> %negv2, %mul ++ %negadd = fneg contract <8 x float> %add ++ store <8 x float> %negadd, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/fma-v2f64.ll b/llvm/test/CodeGen/LoongArch/lsx/fma-v2f64.ll +new file mode 100644 +index 000000000000..8e0459b4afab +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/fma-v2f64.ll +@@ -0,0 +1,804 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx --fp-contract=fast < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-FAST ++; RUN: llc --mtriple=loongarch64 
--mattr=+lsx --fp-contract=on < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-ON ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx --fp-contract=off < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-OFF ++ ++define void @vfmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfadd.d $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfadd.d $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul<2 x double> %v0, %v1 ++ %add = fadd<2 x double> %mul, %v2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++define void @vfmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmsub_d: ++; 
CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul<2 x double> %v0, %v1 ++ %sub = fsub<2 x double> %mul, %v2 ++ store <2 x double> %sub, ptr %res ++ ret void ++} ++ ++define void @vfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfadd.d $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vbitrevi.d $vr0, $vr0, 63 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfadd.d $vr0, $vr0, $vr1 ++; 
CONTRACT-OFF-NEXT: vbitrevi.d $vr0, $vr0, 63 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul<2 x double> %v0, %v1 ++ %add = fadd<2 x double> %mul, %v2 ++ %negadd = fneg<2 x double> %add ++ store <2 x double> %negadd, ptr %res ++ ret void ++} ++ ++define void @vfnmadd_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmadd_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmadd_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.d $vr1, $vr1, 63 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmadd_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.d $vr1, $vr1, 63 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg nsz<2 x double> %v0 ++ %negv2 = fneg nsz<2 x double> %v2 ++ %mul = fmul nsz<2 x double> %negv0, %v1 ++ %add = fadd nsz<2 x double> %mul, %negv2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmadd.d is not emitted. 
++define void @not_vfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_vfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-FAST-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_vfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.d $vr1, $vr1, 63 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_vfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.d $vr1, $vr1, 63 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg<2 x double> %v0 ++ %negv2 = fneg<2 x double> %v2 ++ %mul = fmul<2 x double> %negv0, %v1 ++ %add = fadd<2 x double> %mul, %negv2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++define void @vfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; 
CONTRACT-ON-LABEL: vfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vbitrevi.d $vr0, $vr0, 63 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.d $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vbitrevi.d $vr0, $vr0, 63 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv2 = fneg<2 x double> %v2 ++ %mul = fmul<2 x double> %v0, %v1 ++ %add = fadd<2 x double> %mul, %negv2 ++ %neg = fneg<2 x double> %add ++ store <2 x double> %neg, ptr %res ++ ret void ++} ++ ++define void @vfnmsub_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmsub_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmsub_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmsub_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; 
CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg nsz<2 x double> %v0 ++ %mul = fmul nsz<2 x double> %negv0, %v1 ++ %add = fadd nsz<2 x double> %mul, %v2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmsub.d is not emitted. ++define void @not_vfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_vfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-FAST-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_vfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.d $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_vfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.d $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg<2 x double> %v0 ++ %mul = fmul<2 x double> %negv0, %v1 ++ %add = fadd<2 x double> %mul, %v2 ++ store <2 x double> %add, ptr 
%res ++ ret void ++} ++ ++define void @contract_vfmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %add = fadd contract <2 x double> %mul, %v2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++define void @contract_vfmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmsub.d $vr0, $vr2, $vr1, 
$vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %sub = fsub contract <2 x double> %mul, %v2 ++ store <2 x double> %sub, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %add = fadd contract <2 x double> %mul, %v2 ++ %negadd = fneg contract <2 x double> %add ++ store <2 x double> 
%negadd, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmadd_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmadd_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfnmadd_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmadd_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg contract nsz<2 x double> %v0 ++ %negv2 = fneg contract nsz<2 x double> %v2 ++ %mul = fmul contract nsz<2 x double> %negv0, %v1 ++ %add = fadd contract nsz<2 x double> %mul, %negv2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmadd.d is not emitted. 
++define void @not_contract_vfnmadd_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_vfnmadd_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-FAST-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_vfnmadd_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-ON-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_vfnmadd_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-OFF-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg contract <2 x double> %v0 ++ %negv2 = fneg contract <2 x double> %v2 ++ %mul = fmul contract <2 x double> %negv0, %v1 ++ %add = fadd contract <2 x double> %mul, %negv2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; 
CONTRACT-ON-LABEL: contract_vfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv2 = fneg contract <2 x double> %v2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %add = fadd contract <2 x double> %mul, %negv2 ++ %neg = fneg contract <2 x double> %add ++ store <2 x double> %neg, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmsub_d_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmsub_d_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfnmsub_d_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmsub_d_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 
++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg contract nsz<2 x double> %v0 ++ %mul = fmul contract nsz<2 x double> %negv0, %v1 ++ %add = fadd contract nsz<2 x double> %mul, %v2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmsub.d is not emitted. ++define void @not_contract_vfnmsub_d(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_vfnmsub_d: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-FAST-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_vfnmsub_d: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-ON-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_vfnmsub_d: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.d $vr2, $vr2, 63 ++; CONTRACT-OFF-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %negv0 = fneg contract <2 x double> %v0 ++ %mul = fmul contract <2 x double> %negv0, %v1 ++ %add = fadd contract <2 x double> %mul, %v2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++define void @vfmadd_d_contract(ptr 
%res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmadd_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmadd_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmadd_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %add = fadd contract <2 x double> %mul, %v2 ++ store <2 x double> %add, ptr %res ++ ret void ++} ++ ++define void @vfmsub_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmsub_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmsub_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; 
CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmsub_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %sub = fsub contract <2 x double> %mul, %v2 ++ store <2 x double> %sub, ptr %res ++ ret void ++} ++ ++define void @vfnmadd_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmadd_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmadd_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmadd_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmadd.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %add = fadd contract <2 x double> %mul, %v2 ++ %negadd = fneg contract <2 x double> %add ++ store <2 x double> %negadd, ptr %res ++ ret void ++} ++ ++define void 
@vfnmsub_d_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmsub_d_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmsub_d_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmsub_d_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmsub.d $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <2 x double>, ptr %a0 ++ %v1 = load <2 x double>, ptr %a1 ++ %v2 = load <2 x double>, ptr %a2 ++ %mul = fmul contract <2 x double> %v0, %v1 ++ %negv2 = fneg contract <2 x double> %v2 ++ %add = fadd contract <2 x double> %negv2, %mul ++ %negadd = fneg contract <2 x double> %add ++ store <2 x double> %negadd, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/fma-v4f32.ll b/llvm/test/CodeGen/LoongArch/lsx/fma-v4f32.ll +new file mode 100644 +index 000000000000..7efbd61c0c4f +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/fma-v4f32.ll +@@ -0,0 +1,804 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx --fp-contract=fast < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-FAST ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx --fp-contract=on < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-ON ++; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx --fp-contract=off < %s \ ++; RUN: | FileCheck %s --check-prefix=CONTRACT-OFF ++ ++define void @vfmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfadd.s $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfadd.s $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul<4 x float> %v0, %v1 ++ %add = fadd<4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void @vfmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; 
CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul<4 x float> %v0, %v1 ++ %sub = fsub<4 x float> %mul, %v2 ++ store <4 x float> %sub, ptr %res ++ ret void ++} ++ ++define void @vfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfadd.s $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vbitrevi.w $vr0, $vr0, 31 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfadd.s $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vbitrevi.w $vr0, $vr0, 31 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: 
++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul<4 x float> %v0, %v1 ++ %add = fadd<4 x float> %mul, %v2 ++ %negadd = fneg<4 x float> %add ++ store <4 x float> %negadd, ptr %res ++ ret void ++} ++ ++define void @vfnmadd_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmadd_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmadd_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.w $vr1, $vr1, 31 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmadd_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.w $vr1, $vr1, 31 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg nsz<4 x float> %v0 ++ %negv2 = fneg nsz<4 x float> %v2 ++ %mul = fmul nsz<4 x float> %negv0, %v1 ++ %add = fadd nsz<4 x float> %mul, %negv2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmadd.s is not emitted. 
++define void @not_vfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_vfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-FAST-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_vfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.w $vr1, $vr1, 31 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_vfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.w $vr1, $vr1, 31 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg<4 x float> %v0 ++ %negv2 = fneg<4 x float> %v2 ++ %mul = fmul<4 x float> %negv0, %v1 ++ %add = fadd<4 x float> %mul, %negv2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void @vfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: 
vfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-ON-NEXT: vbitrevi.w $vr0, $vr0, 31 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.s $vr0, $vr0, $vr1 ++; CONTRACT-OFF-NEXT: vbitrevi.w $vr0, $vr0, 31 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv2 = fneg<4 x float> %v2 ++ %mul = fmul<4 x float> %v0, %v1 ++ %add = fadd<4 x float> %mul, %negv2 ++ %neg = fneg<4 x float> %add ++ store <4 x float> %neg, ptr %res ++ ret void ++} ++ ++define void @vfnmsub_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmsub_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmsub_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmsub_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 
++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg nsz<4 x float> %v0 ++ %mul = fmul nsz<4 x float> %negv0, %v1 ++ %add = fadd nsz<4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmsub.s is not emitted. ++define void @not_vfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_vfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-FAST-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_vfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-ON-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-ON-NEXT: vfsub.s $vr0, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_vfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmul.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a3, 0 ++; CONTRACT-OFF-NEXT: vfsub.s $vr0, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg<4 x float> %v0 ++ %mul = fmul<4 x float> %negv0, %v1 ++ %add = fadd<4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void 
@contract_vfmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %add = fadd contract <4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void @contract_vfmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 
++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %sub = fsub contract <4 x float> %mul, %v2 ++ store <4 x float> %sub, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %add = fadd contract <4 x float> %mul, %v2 ++ %negadd = fneg contract <4 x float> %add ++ store <4 x float> %negadd, ptr %res ++ ret void ++} ++ ++define void 
@contract_vfnmadd_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmadd_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfnmadd_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmadd_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg contract nsz<4 x float> %v0 ++ %negv2 = fneg contract nsz<4 x float> %v2 ++ %mul = fmul contract nsz<4 x float> %negv0, %v1 ++ %add = fadd contract nsz<4 x float> %mul, %negv2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmadd.s is not emitted. 
++define void @not_contract_vfnmadd_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_vfnmadd_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-FAST-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_vfnmadd_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-ON-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_vfnmadd_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-OFF-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg contract <4 x float> %v0 ++ %negv2 = fneg contract <4 x float> %v2 ++ %mul = fmul contract <4 x float> %negv0, %v1 ++ %add = fadd contract <4 x float> %mul, %negv2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; 
CONTRACT-ON-LABEL: contract_vfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv2 = fneg contract <4 x float> %v2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %add = fadd contract <4 x float> %mul, %negv2 ++ %neg = fneg contract <4 x float> %add ++ store <4 x float> %neg, ptr %res ++ ret void ++} ++ ++define void @contract_vfnmsub_s_nsz(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: contract_vfnmsub_s_nsz: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: contract_vfnmsub_s_nsz: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: contract_vfnmsub_s_nsz: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; 
CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg contract nsz<4 x float> %v0 ++ %mul = fmul contract nsz<4 x float> %negv0, %v1 ++ %add = fadd contract nsz<4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++;; Check that vfnmsub.s is not emitted. ++define void @not_contract_vfnmsub_s(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: not_contract_vfnmsub_s: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-FAST-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: not_contract_vfnmsub_s: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-ON-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: not_contract_vfnmsub_s: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vbitrevi.w $vr2, $vr2, 31 ++; CONTRACT-OFF-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %negv0 = fneg contract <4 x float> %v0 ++ %mul = fmul contract <4 x float> %negv0, %v1 ++ %add = fadd contract <4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void @vfmadd_s_contract(ptr %res, ptr %a0, ptr 
%a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmadd_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmadd_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfmadd_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %add = fadd contract <4 x float> %mul, %v2 ++ store <4 x float> %add, ptr %res ++ ret void ++} ++ ++define void @vfmsub_s_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfmsub_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfmsub_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; 
CONTRACT-OFF-LABEL: vfmsub_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %sub = fsub contract <4 x float> %mul, %v2 ++ store <4 x float> %sub, ptr %res ++ ret void ++} ++ ++define void @vfnmadd_s_contract(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmadd_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmadd_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmadd_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmadd.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %add = fadd contract <4 x float> %mul, %v2 ++ %negadd = fneg contract <4 x float> %add ++ store <4 x float> %negadd, ptr %res ++ ret void ++} ++ ++define void @vfnmsub_s_contract(ptr %res, ptr %a0, ptr 
%a1, ptr %a2) nounwind { ++; CONTRACT-FAST-LABEL: vfnmsub_s_contract: ++; CONTRACT-FAST: # %bb.0: # %entry ++; CONTRACT-FAST-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-FAST-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-FAST-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-FAST-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-FAST-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-FAST-NEXT: ret ++; ++; CONTRACT-ON-LABEL: vfnmsub_s_contract: ++; CONTRACT-ON: # %bb.0: # %entry ++; CONTRACT-ON-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-ON-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-ON-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-ON-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-ON-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-ON-NEXT: ret ++; ++; CONTRACT-OFF-LABEL: vfnmsub_s_contract: ++; CONTRACT-OFF: # %bb.0: # %entry ++; CONTRACT-OFF-NEXT: vld $vr0, $a3, 0 ++; CONTRACT-OFF-NEXT: vld $vr1, $a2, 0 ++; CONTRACT-OFF-NEXT: vld $vr2, $a1, 0 ++; CONTRACT-OFF-NEXT: vfnmsub.s $vr0, $vr2, $vr1, $vr0 ++; CONTRACT-OFF-NEXT: vst $vr0, $a0, 0 ++; CONTRACT-OFF-NEXT: ret ++entry: ++ %v0 = load <4 x float>, ptr %a0 ++ %v1 = load <4 x float>, ptr %a1 ++ %v2 = load <4 x float>, ptr %a2 ++ %mul = fmul contract <4 x float> %v0, %v1 ++ %negv2 = fneg contract <4 x float> %v2 ++ %add = fadd contract <4 x float> %negv2, %mul ++ %negadd = fneg contract <4 x float> %add ++ store <4 x float> %negadd, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 8aa8ce5abc7bf58ef9ae0460d1e9ed705895a887 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Mon, 25 Dec 2023 10:09:20 +0800 +Subject: [PATCH 27/35] [LoongArch] Fix LASX vector_extract codegen + +Custom lowering `ISD::EXTRACT_VECTOR_ELT` with lasx. 
+ +(cherry picked from commit 47c88bcd5de91522241cca1aaa1b7762ceb01394) +--- + .../LoongArch/LoongArchISelLowering.cpp | 21 +++- + .../Target/LoongArch/LoongArchISelLowering.h | 1 + + .../LoongArch/LoongArchLASXInstrInfo.td | 40 ++---- + .../lasx/ir-instruction/extractelement.ll | 114 ++++++++++++++---- + 4 files changed, 119 insertions(+), 57 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index c7f4b1d24f07..cf881ce720a6 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -277,7 +277,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + setOperationAction(ISD::UNDEF, VT, Legal); + + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); +- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal); ++ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + + setOperationAction(ISD::SETCC, VT, Legal); +@@ -395,6 +395,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + return lowerWRITE_REGISTER(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return lowerINSERT_VECTOR_ELT(Op, DAG); ++ case ISD::EXTRACT_VECTOR_ELT: ++ return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: +@@ -502,6 +504,23 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, + return SDValue(); + } + ++SDValue ++LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, ++ SelectionDAG &DAG) const { ++ EVT VecTy = Op->getOperand(0)->getValueType(0); ++ SDValue Idx = Op->getOperand(1); ++ EVT EltTy = VecTy.getVectorElementType(); ++ unsigned NumElts = VecTy.getVectorNumElements(); ++ ++ if (isa(Idx) && ++ (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || ++ EltTy == MVT::f64 || ++ cast(Idx)->getZExtValue() < NumElts / 2)) ++ return Op; ++ ++ 
return SDValue(); ++} ++ + SDValue + LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 2c35f9e5d378..6b5a851ec55d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -279,6 +279,7 @@ private: + SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) const; ++ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index ec6983d0f487..9b7a34688811 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1590,38 +1590,14 @@ def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), + (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; + def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), + (VPICKVE2GR_H (EXTRACT_SUBREG v16i16:$xj, sub_128), uimm3:$imm)>; +-def : Pat<(i64 (vector_extract v8i32:$xj, uimm2:$imm)), +- (VPICKVE2GR_W (EXTRACT_SUBREG v8i32:$xj, sub_128), uimm2:$imm)>; +-def : Pat<(i64 (vector_extract v4i64:$xj, uimm1:$imm)), +- (VPICKVE2GR_D (EXTRACT_SUBREG v4i64:$xj, sub_128), uimm1:$imm)>; +-def : Pat<(f32 (vector_extract v8f32:$xj, uimm2:$imm)), +- (f32 (EXTRACT_SUBREG (XVREPL128VEI_W v8f32:$xj, uimm2:$imm), sub_32))>; +-def : Pat<(f64 (vector_extract v4f64:$xj, uimm1:$imm)), +- (f64 (EXTRACT_SUBREG (XVREPL128VEI_D v4f64:$xj, uimm1:$imm), sub_64))>; +- +-// Vector extraction 
with variable index. +-def : Pat<(i64 (vector_extract v32i8:$xj, i64:$rk)), +- (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_B v32i8:$xj, +- i64:$rk), +- sub_32)), +- GPR), (i64 24))>; +-def : Pat<(i64 (vector_extract v16i16:$xj, i64:$rk)), +- (SRAI_W (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_H v16i16:$xj, +- i64:$rk), +- sub_32)), +- GPR), (i64 16))>; +-def : Pat<(i64 (vector_extract v8i32:$xj, i64:$rk)), +- (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (XVREPLVE_W v8i32:$xj, i64:$rk), +- sub_32)), +- GPR)>; +-def : Pat<(i64 (vector_extract v4i64:$xj, i64:$rk)), +- (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (XVREPLVE_D v4i64:$xj, i64:$rk), +- sub_64)), +- GPR)>; +-def : Pat<(f32 (vector_extract v8f32:$xj, i64:$rk)), +- (f32 (EXTRACT_SUBREG (XVREPLVE_W v8f32:$xj, i64:$rk), sub_32))>; +-def : Pat<(f64 (vector_extract v4f64:$xj, i64:$rk)), +- (f64 (EXTRACT_SUBREG (XVREPLVE_D v4f64:$xj, i64:$rk), sub_64))>; ++def : Pat<(i64 (vector_extract v8i32:$xj, uimm3:$imm)), ++ (XVPICKVE2GR_W v8i32:$xj, uimm3:$imm)>; ++def : Pat<(i64 (vector_extract v4i64:$xj, uimm2:$imm)), ++ (XVPICKVE2GR_D v4i64:$xj, uimm2:$imm)>; ++def : Pat<(f32 (vector_extract v8f32:$xj, uimm3:$imm)), ++ (MOVGR2FR_W (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm))>; ++def : Pat<(f64 (vector_extract v4f64:$xj, uimm2:$imm)), ++ (MOVGR2FR_D (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm))>; + + // vselect + def : Pat<(v32i8 (vselect LASX256:$xj, LASX256:$xd, +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +index 78f584cd09a8..02b76bf75b75 100644 +--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +@@ -31,7 +31,7 @@ define void @extract_8xi32(ptr %src, ptr %dst) nounwind { + ; CHECK-LABEL: extract_8xi32: + ; CHECK: # %bb.0: + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 1 ++; CHECK-NEXT: xvpickve2gr.w $a0, 
$xr0, 1 + ; CHECK-NEXT: st.w $a0, $a1, 0 + ; CHECK-NEXT: ret + %v = load volatile <8 x i32>, ptr %src +@@ -44,7 +44,7 @@ define void @extract_4xi64(ptr %src, ptr %dst) nounwind { + ; CHECK-LABEL: extract_4xi64: + ; CHECK: # %bb.0: + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 ++; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 + ; CHECK-NEXT: st.d $a0, $a1, 0 + ; CHECK-NEXT: ret + %v = load volatile <4 x i64>, ptr %src +@@ -57,8 +57,8 @@ define void @extract_8xfloat(ptr %src, ptr %dst) nounwind { + ; CHECK-LABEL: extract_8xfloat: + ; CHECK: # %bb.0: + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: ori $a0, $zero, 7 +-; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a0 ++; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 ++; CHECK-NEXT: movgr2fr.w $fa0, $a0 + ; CHECK-NEXT: fst.s $fa0, $a1, 0 + ; CHECK-NEXT: ret + %v = load volatile <8 x float>, ptr %src +@@ -71,8 +71,8 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind { + ; CHECK-LABEL: extract_4xdouble: + ; CHECK: # %bb.0: + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: ori $a0, $zero, 3 +-; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a0 ++; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 ++; CHECK-NEXT: movgr2fr.d $fa0, $a0 + ; CHECK-NEXT: fst.d $fa0, $a1, 0 + ; CHECK-NEXT: ret + %v = load volatile <4 x double>, ptr %src +@@ -84,12 +84,22 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind { + define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + ; CHECK-LABEL: extract_32xi8_idx: + ; CHECK: # %bb.0: +-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: xvreplve.b $xr0, $xr0, $a2 +-; CHECK-NEXT: movfr2gr.s $a0, $fa0 +-; CHECK-NEXT: srai.w $a0, $a0, 24 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; 
CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 0 ++; CHECK-NEXT: ld.b $a0, $a0, 0 + ; CHECK-NEXT: st.b $a0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 + ; CHECK-NEXT: ret + %v = load volatile <32 x i8>, ptr %src + %e = extractelement <32 x i8> %v, i32 %idx +@@ -100,12 +110,22 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + ; CHECK-LABEL: extract_16xi16_idx: + ; CHECK: # %bb.0: +-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: xvreplve.h $xr0, $xr0, $a2 +-; CHECK-NEXT: movfr2gr.s $a0, $fa0 +-; CHECK-NEXT: srai.w $a0, $a0, 16 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 1 ++; CHECK-NEXT: ld.h $a0, $a0, 0 + ; CHECK-NEXT: st.h $a0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 + ; CHECK-NEXT: ret + %v = load volatile <16 x i16>, ptr %src + %e = extractelement <16 x i16> %v, i32 %idx +@@ -116,11 +136,22 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + ; CHECK-LABEL: extract_8xi32_idx: + ; CHECK: # %bb.0: +-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; 
CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a2 +-; CHECK-NEXT: movfr2gr.s $a0, $fa0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ++; CHECK-NEXT: ld.w $a0, $a0, 0 + ; CHECK-NEXT: st.w $a0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 + ; CHECK-NEXT: ret + %v = load volatile <8 x i32>, ptr %src + %e = extractelement <8 x i32> %v, i32 %idx +@@ -131,11 +162,22 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + ; CHECK-LABEL: extract_4xi64_idx: + ; CHECK: # %bb.0: +-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a2 +-; CHECK-NEXT: movfr2gr.d $a0, $fa0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ++; CHECK-NEXT: ld.d $a0, $a0, 0 + ; CHECK-NEXT: st.d $a0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 + ; CHECK-NEXT: ret + %v = load volatile <4 x i64>, ptr %src + %e = extractelement <4 x i64> %v, i32 %idx +@@ -146,10 +188,22 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + ; CHECK-LABEL: extract_8xfloat_idx: 
+ ; CHECK: # %bb.0: +-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: xvreplve.w $xr0, $xr0, $a2 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ++; CHECK-NEXT: fld.s $fa0, $a0, 0 + ; CHECK-NEXT: fst.s $fa0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 + ; CHECK-NEXT: ret + %v = load volatile <8 x float>, ptr %src + %e = extractelement <8 x float> %v, i32 %idx +@@ -160,10 +214,22 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind { + ; CHECK-LABEL: extract_4xdouble_idx: + ; CHECK: # %bb.0: +-; CHECK-NEXT: bstrpick.d $a2, $a2, 31, 0 ++; CHECK-NEXT: addi.d $sp, $sp, -64 ++; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; CHECK-NEXT: addi.d $fp, $sp, 64 ++; CHECK-NEXT: srli.d $a3, $sp, 5 ++; CHECK-NEXT: slli.d $sp, $a3, 5 + ; CHECK-NEXT: xvld $xr0, $a0, 0 +-; CHECK-NEXT: xvreplve.d $xr0, $xr0, $a2 ++; CHECK-NEXT: xvst $xr0, $sp, 0 ++; CHECK-NEXT: addi.d $a0, $sp, 0 ++; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ++; CHECK-NEXT: fld.d $fa0, $a0, 0 + ; CHECK-NEXT: fst.d $fa0, $a1, 0 ++; CHECK-NEXT: addi.d $sp, $fp, -64 ++; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 64 + ; CHECK-NEXT: ret + %v = load volatile <4 x double>, ptr %src + %e = extractelement <4 x double> %v, i32 %idx +-- +2.20.1 + + +From 
5953c8d6a82ac2ad2438de5dd46525a9f7a084fb Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Wed, 27 Dec 2023 16:31:49 +0800 +Subject: [PATCH 28/35] [LoongArch] Fix incorrect pattern XVREPL128VEI_{W/D} + instructions + +Remove the incorrect patterns for `XVREPL128VEI_{W/D}` instructions, +and add correct patterns for XVREPLVE0_{W/D} instructions + +(cherry picked from commit c7367f985e0d27aeb8bc993406d1b9f4ca307399) +--- + llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 6 +++--- + llvm/test/CodeGen/LoongArch/lasx/build-vector.ll | 4 ++-- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 9b7a34688811..059689cef840 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1571,11 +1571,11 @@ def : Pat<(loongarch_vreplve v8i32:$xj, GRLenVT:$rk), + def : Pat<(loongarch_vreplve v4i64:$xj, GRLenVT:$rk), + (XVREPLVE_D v4i64:$xj, GRLenVT:$rk)>; + +-// XVREPL128VEI_{W/D} ++// XVREPLVE0_{W/D} + def : Pat<(lasxsplatf32 FPR32:$fj), +- (XVREPL128VEI_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), 0)>; ++ (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>; + def : Pat<(lasxsplatf64 FPR64:$fj), +- (XVREPL128VEI_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), 0)>; ++ (XVREPLVE0_D (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64))>; + + // Loads/Stores + foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { +diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +index 6824ab5cda8d..ae6f31aaec64 100644 +--- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll ++++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +@@ -57,7 +57,7 @@ define void @buildvector_v8f32_splat(ptr %dst, float %a0) nounwind { + ; CHECK-LABEL: buildvector_v8f32_splat: + ; CHECK: # %bb.0: # %entry + ; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 
+-; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 0 ++; CHECK-NEXT: xvreplve0.w $xr0, $xr0 + ; CHECK-NEXT: xvst $xr0, $a0, 0 + ; CHECK-NEXT: ret + entry: +@@ -71,7 +71,7 @@ define void @buildvector_v4f64_splat(ptr %dst, double %a0) nounwind { + ; CHECK-LABEL: buildvector_v4f64_splat: + ; CHECK: # %bb.0: # %entry + ; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +-; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 0 ++; CHECK-NEXT: xvreplve0.d $xr0, $xr0 + ; CHECK-NEXT: xvst $xr0, $a0, 0 + ; CHECK-NEXT: ret + entry: +-- +2.20.1 + + +From 7e21c962da87491bb438ea3906826875f53f2931 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Thu, 28 Dec 2023 20:56:32 +0800 +Subject: [PATCH 29/35] [LoongArch] Fix incorrect pattern [X]VBITSELI_B + instructions + +Adjusted the operand order of [X]VBITSELI_B to correctly match vselect. + +(cherry picked from commit da5378e87e11689d05a58198d6e15e9551916794) +--- + llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 4 ++-- + llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 4 ++-- + llvm/test/CodeGen/LoongArch/lasx/vselect.ll | 6 +++--- + llvm/test/CodeGen/LoongArch/lsx/vselect.ll | 6 +++--- + 4 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index 059689cef840..b3c11bc5423d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1600,8 +1600,8 @@ def : Pat<(f64 (vector_extract v4f64:$xj, uimm2:$imm)), + (MOVGR2FR_D (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm))>; + + // vselect +-def : Pat<(v32i8 (vselect LASX256:$xj, LASX256:$xd, +- (v32i8 (SplatPat_uimm8 uimm8:$imm)))), ++def : Pat<(v32i8 (vselect LASX256:$xd, (v32i8 (SplatPat_uimm8 uimm8:$imm)), ++ LASX256:$xj)), + (XVBITSELI_B LASX256:$xd, LASX256:$xj, uimm8:$imm)>; + foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in + def : Pat<(vt (vselect LASX256:$xa, LASX256:$xk, LASX256:$xj)), +diff 
--git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index e468176885d7..5569c2cd15b5 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1731,8 +1731,8 @@ def : Pat<(f64 (vector_extract v2f64:$vj, i64:$rk)), + (f64 (EXTRACT_SUBREG (VREPLVE_D v2f64:$vj, i64:$rk), sub_64))>; + + // vselect +-def : Pat<(v16i8 (vselect LSX128:$vj, LSX128:$vd, +- (v16i8 (SplatPat_uimm8 uimm8:$imm)))), ++def : Pat<(v16i8 (vselect LSX128:$vd, (v16i8 (SplatPat_uimm8 uimm8:$imm)), ++ LSX128:$vj)), + (VBITSELI_B LSX128:$vd, LSX128:$vj, uimm8:$imm)>; + foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in + def : Pat<(vt (vselect LSX128:$va, LSX128:$vk, LSX128:$vj)), +diff --git a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll +index 24f4bcf752d3..ec2fc28db33c 100644 +--- a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll ++++ b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll +@@ -6,11 +6,11 @@ define void @select_v32i8_imm(ptr %res, ptr %a0) nounwind { + ; CHECK: # %bb.0: + ; CHECK-NEXT: xvld $xr0, $a1, 0 + ; CHECK-NEXT: xvrepli.h $xr1, -256 +-; CHECK-NEXT: xvbitseli.b $xr0, $xr1, 1 +-; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: xvbitseli.b $xr1, $xr0, 1 ++; CHECK-NEXT: xvst $xr1, $a0, 0 + ; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a0 +- %sel = select <32 x i1> , <32 x i8> %v0, <32 x i8> ++ %sel = select <32 x i1> , <32 x i8> , <32 x i8> %v0 + store <32 x i8> %sel, ptr %res + ret void + } +diff --git a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll +index 00e3d9313f13..746152f0f026 100644 +--- a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll ++++ b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll +@@ -6,11 +6,11 @@ define void @select_v16i8_imm(ptr %res, ptr %a0) nounwind { + ; CHECK: # %bb.0: + ; CHECK-NEXT: vld $vr0, $a1, 0 + ; CHECK-NEXT: vrepli.h $vr1, -256 +-; CHECK-NEXT: 
vbitseli.b $vr0, $vr1, 255 +-; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: vbitseli.b $vr1, $vr0, 255 ++; CHECK-NEXT: vst $vr1, $a0, 0 + ; CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a0 +- %sel = select <16 x i1> , <16 x i8> %v0, <16 x i8> ++ %sel = select <16 x i1> , <16 x i8> , <16 x i8> %v0 + store <16 x i8> %sel, ptr %res + ret void + } +-- +2.20.1 + + +From 9aab6c004b73d1069444b17a9768310f288b3130 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 23 Jan 2024 09:06:35 +0800 +Subject: [PATCH 30/35] [LoongArch] Permit auto-vectorization using LSX/LASX + with `auto-vec` feature (#78943) + +With enough codegen complete, we can now correctly report the size of +vector registers for LSX/LASX, allowing auto vectorization (The +`auto-vec` feature needs to be enabled simultaneously). + +As described, the `auto-vec` feature is an experimental one. To ensure +that automatic vectorization is not enabled by default, because the +information provided by the current `TTI` cannot yield additional +benefits for automatic vectorization. 
+ +(cherry picked from commit fcff4582f01db2f5a99e3acf452aec9f2d8a126a) +--- + llvm/lib/Target/LoongArch/LoongArch.td | 4 ++ + .../lib/Target/LoongArch/LoongArchSubtarget.h | 2 + + .../LoongArchTargetTransformInfo.cpp | 18 +++++ + .../LoongArch/LoongArchTargetTransformInfo.h | 2 + + .../LoopVectorize/LoongArch/defaults.ll | 66 +++++++++++++++++++ + .../LoopVectorize/LoongArch/lit.local.cfg | 4 ++ + 6 files changed, 96 insertions(+) + create mode 100644 llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll + create mode 100644 llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg + +diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td +index 75b65fe69f26..2a4c991a43b0 100644 +--- a/llvm/lib/Target/LoongArch/LoongArch.td ++++ b/llvm/lib/Target/LoongArch/LoongArch.td +@@ -105,6 +105,10 @@ def FeatureUAL + def FeatureRelax + : SubtargetFeature<"relax", "HasLinkerRelax", "true", + "Enable Linker relaxation">; ++// Experimental auto vectorization ++def FeatureAutoVec ++ : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true", ++ "Experimental auto vectorization">; + + //===----------------------------------------------------------------------===// + // Registers, instruction descriptions ... 
+diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +index 5c173675cca4..174e4cba8326 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h ++++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +@@ -44,6 +44,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { + bool HasLaLocalWithAbs = false; + bool HasUAL = false; + bool HasLinkerRelax = false; ++ bool HasExpAutoVec = false; + unsigned GRLen = 32; + MVT GRLenVT = MVT::i32; + LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; +@@ -102,6 +103,7 @@ public: + bool hasLaLocalWithAbs() const { return HasLaLocalWithAbs; } + bool hasUAL() const { return HasUAL; } + bool hasLinkerRelax() const { return HasLinkerRelax; } ++ bool hasExpAutoVec() const { return HasExpAutoVec; } + MVT getGRLenVT() const { return GRLenVT; } + unsigned getGRLen() const { return GRLen; } + LoongArchABI::ABI getTargetABI() const { return TargetABI; } +diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +index a6de86eea116..04349aa52b54 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +@@ -19,4 +19,22 @@ using namespace llvm; + + #define DEBUG_TYPE "loongarchtti" + ++TypeSize LoongArchTTIImpl::getRegisterBitWidth( ++ TargetTransformInfo::RegisterKind K) const { ++ switch (K) { ++ case TargetTransformInfo::RGK_Scalar: ++ return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); ++ case TargetTransformInfo::RGK_FixedWidthVector: ++ if (ST->hasExtLASX() && ST->hasExpAutoVec()) ++ return TypeSize::getFixed(256); ++ if (ST->hasExtLSX() && ST->hasExpAutoVec()) ++ return TypeSize::getFixed(128); ++ return TypeSize::getFixed(0); ++ case TargetTransformInfo::RGK_ScalableVector: ++ return TypeSize::getScalable(0); ++ } ++ ++ llvm_unreachable("Unsupported register kind"); ++} ++ + // TODO: Implement more hooks to provide TTI machinery for LoongArch. +diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +index 9e02f793ba8a..d296c9ed576f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h ++++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +@@ -39,6 +39,8 @@ public: + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + ++ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; ++ + // TODO: Implement more hooks to provide TTI machinery for LoongArch. + }; + +diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll +new file mode 100644 +index 000000000000..a8ac2411dd82 +--- /dev/null ++++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll +@@ -0,0 +1,66 @@ ++; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ++; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s ++ ++;; This is a collection of tests whose only purpose is to show changes in the ++;; default configuration. Please keep these tests minimal - if you're testing ++;; functionality of some specific configuration, please place that in a ++;; seperate test file with a hard coded configuration (even if that ++;; configuration is the current default). 
++ ++target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" ++target triple = "loongarch64" ++ ++define void @vector_add(ptr noalias nocapture %a, i64 %v) { ++; CHECK-LABEL: define void @vector_add ++; CHECK-SAME: (ptr noalias nocapture [[A:%.*]], i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ++; CHECK: vector.ph: ++; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 ++; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ++; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ++; CHECK: vector.body: ++; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ++; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ++; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] ++; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 ++; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 ++; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ++; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP2]], align 8 ++; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ++; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ++; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ++; CHECK: middle.block: ++; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 ++; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ++; CHECK: scalar.ph: ++; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ++; CHECK-NEXT: br label [[FOR_BODY:%.*]] ++; CHECK: for.body: ++; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ++; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr 
[[A]], i64 [[IV]] ++; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ++; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]] ++; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 ++; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ++; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 ++; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ++; CHECK: for.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %for.body ++ ++for.body: ++ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv ++ %elem = load i64, ptr %arrayidx ++ %add = add i64 %elem, %v ++ store i64 %add, ptr %arrayidx ++ %iv.next = add nuw nsw i64 %iv, 1 ++ %exitcond.not = icmp eq i64 %iv.next, 1024 ++ br i1 %exitcond.not, label %for.end, label %for.body ++ ++for.end: ++ ret void ++} +diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg +new file mode 100644 +index 000000000000..9570af17fe5f +--- /dev/null ++++ b/llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg +@@ -0,0 +1,4 @@ ++config.suffixes = [".ll"] ++ ++if not "LoongArch" in config.root.targets: ++ config.unsupported = True +-- +2.20.1 + + +From 61b8589c3c71026af28b6a71e0b5d4c41b7c78d1 Mon Sep 17 00:00:00 2001 +From: yjijd +Date: Tue, 23 Jan 2024 15:16:23 +0800 +Subject: [PATCH 31/35] [CodeGen][LoongArch] Set SINT_TO_FP/UINT_TO_FP to legal + for vector types (#78924) + +Support the following conversions: +v4i32->v4f32, v2i64->v2f64(LSX) +v8i32->v8f32, v4i64->v4f64(LASX) +v4i32->v4f64, v4i64->v4f32(LASX) + +(cherry picked from commit f799f936929c232a16abc7c520a10fecadbf05f9) +--- + .../LoongArch/LoongArchISelLowering.cpp | 4 ++ + .../LoongArch/LoongArchLASXInstrInfo.td | 22 +++++++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 8 +++ + .../LoongArch/lasx/ir-instruction/sitofp.ll | 57 +++++++++++++++++++ + 
.../LoongArch/lasx/ir-instruction/uitofp.ll | 57 +++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/sitofp.ll | 28 +++++++++ + .../LoongArch/lsx/ir-instruction/uitofp.ll | 28 +++++++++ + 7 files changed, 204 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index cf881ce720a6..7a360b42e15d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -256,6 +256,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, + Expand); + } ++ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, ++ {MVT::v4i32, MVT::v2i64}, Legal); + for (MVT VT : {MVT::v4f32, MVT::v2f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); +@@ -298,6 +300,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, + Expand); + } ++ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, ++ {MVT::v8i32, MVT::v4i32, MVT::v4i64}, Legal); + for (MVT VT : {MVT::v8f32, MVT::v4f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index b3c11bc5423d..b3e74b480922 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1611,6 +1611,28 
@@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in + def : Pat<(fneg (v8f32 LASX256:$xj)), (XVBITREVI_W LASX256:$xj, 31)>; + def : Pat<(fneg (v4f64 LASX256:$xj)), (XVBITREVI_D LASX256:$xj, 63)>; + ++// XVFFINT_{S_W/D_L} ++def : Pat<(v8f32 (sint_to_fp v8i32:$vj)), (XVFFINT_S_W v8i32:$vj)>; ++def : Pat<(v4f64 (sint_to_fp v4i64:$vj)), (XVFFINT_D_L v4i64:$vj)>; ++def : Pat<(v4f64 (sint_to_fp v4i32:$vj)), ++ (XVFFINT_D_L (VEXT2XV_D_W (SUBREG_TO_REG (i64 0), v4i32:$vj, ++ sub_128)))>; ++def : Pat<(v4f32 (sint_to_fp v4i64:$vj)), ++ (EXTRACT_SUBREG (XVFCVT_S_D (XVPERMI_D (XVFFINT_D_L v4i64:$vj), 238), ++ (XVFFINT_D_L v4i64:$vj)), ++ sub_128)>; ++ ++// XVFFINT_{S_WU/D_LU} ++def : Pat<(v8f32 (uint_to_fp v8i32:$vj)), (XVFFINT_S_WU v8i32:$vj)>; ++def : Pat<(v4f64 (uint_to_fp v4i64:$vj)), (XVFFINT_D_LU v4i64:$vj)>; ++def : Pat<(v4f64 (uint_to_fp v4i32:$vj)), ++ (XVFFINT_D_LU (VEXT2XV_DU_WU (SUBREG_TO_REG (i64 0), v4i32:$vj, ++ sub_128)))>; ++def : Pat<(v4f32 (uint_to_fp v4i64:$vj)), ++ (EXTRACT_SUBREG (XVFCVT_S_D (XVPERMI_D (XVFFINT_D_LU v4i64:$vj), 238), ++ (XVFFINT_D_LU v4i64:$vj)), ++ sub_128)>; ++ + } // Predicates = [HasExtLASX] + + /// Intrinsic pattern +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 5569c2cd15b5..63eac4d1aeb7 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1742,6 +1742,14 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in + def : Pat<(fneg (v4f32 LSX128:$vj)), (VBITREVI_W LSX128:$vj, 31)>; + def : Pat<(fneg (v2f64 LSX128:$vj)), (VBITREVI_D LSX128:$vj, 63)>; + ++// VFFINT_{S_W/D_L} ++def : Pat<(v4f32 (sint_to_fp v4i32:$vj)), (VFFINT_S_W v4i32:$vj)>; ++def : Pat<(v2f64 (sint_to_fp v2i64:$vj)), (VFFINT_D_L v2i64:$vj)>; ++ ++// VFFINT_{S_WU/D_LU} ++def : Pat<(v4f32 (uint_to_fp v4i32:$vj)), (VFFINT_S_WU v4i32:$vj)>; ++def : Pat<(v2f64 (uint_to_fp v2i64:$vj)), (VFFINT_D_LU 
v2i64:$vj)>; ++ + } // Predicates = [HasExtLSX] + + /// Intrinsic pattern +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll +new file mode 100644 +index 000000000000..208a758ea4e9 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll +@@ -0,0 +1,57 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @sitofp_v8i32_v8f32(ptr %res, ptr %in){ ++; CHECK-LABEL: sitofp_v8i32_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvffint.s.w $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %in ++ %v1 = sitofp <8 x i32> %v0 to <8 x float> ++ store <8 x float> %v1, ptr %res ++ ret void ++} ++ ++define void @sitofp_v4f64_v4f64(ptr %res, ptr %in){ ++; CHECK-LABEL: sitofp_v4f64_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvffint.d.l $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %in ++ %v1 = sitofp <4 x i64> %v0 to <4 x double> ++ store <4 x double> %v1, ptr %res ++ ret void ++} ++ ++define void @sitofp_v4i64_v4f32(ptr %res, ptr %in){ ++; CHECK-LABEL: sitofp_v4i64_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvffint.d.l $xr0, $xr0 ++; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 ++; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %in ++ %v1 = sitofp <4 x i64> %v0 to <4 x float> ++ store <4 x float> %v1, ptr %res ++ ret void ++} ++ ++define void @sitofp_v4i32_v4f64(ptr %res, ptr %in){ ++; CHECK-LABEL: sitofp_v4i32_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vext2xv.d.w $xr0, $xr0 ++; CHECK-NEXT: xvffint.d.l $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; 
CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %in ++ %v1 = sitofp <4 x i32> %v0 to <4 x double> ++ store <4 x double> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll +new file mode 100644 +index 000000000000..70cf71c4cec2 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll +@@ -0,0 +1,57 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @uitofp_v8i32_v8f32(ptr %res, ptr %in){ ++; CHECK-LABEL: uitofp_v8i32_v8f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvffint.s.wu $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x i32>, ptr %in ++ %v1 = uitofp <8 x i32> %v0 to <8 x float> ++ store <8 x float> %v1, ptr %res ++ ret void ++} ++ ++define void @uitofp_v4f64_v4f64(ptr %res, ptr %in){ ++; CHECK-LABEL: uitofp_v4f64_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvffint.d.lu $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %in ++ %v1 = uitofp <4 x i64> %v0 to <4 x double> ++ store <4 x double> %v1, ptr %res ++ ret void ++} ++ ++define void @uitofp_v4i64_v4f32(ptr %res, ptr %in){ ++; CHECK-LABEL: uitofp_v4i64_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvffint.d.lu $xr0, $xr0 ++; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 ++; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i64>, ptr %in ++ %v1 = uitofp <4 x i64> %v0 to <4 x float> ++ store <4 x float> %v1, ptr %res ++ ret void ++} ++ ++define void @uitofp_v4i32_v4f64(ptr %res, ptr %in){ ++; CHECK-LABEL: uitofp_v4i32_v4f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vext2xv.du.wu $xr0, $xr0 ++; 
CHECK-NEXT: xvffint.d.lu $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %in ++ %v1 = uitofp <4 x i32> %v0 to <4 x double> ++ store <4 x double> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll +new file mode 100644 +index 000000000000..1e820a37a240 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll +@@ -0,0 +1,28 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @sitofp_v4i32_v4f32(ptr %res, ptr %in){ ++; CHECK-LABEL: sitofp_v4i32_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vffint.s.w $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %in ++ %v1 = sitofp <4 x i32> %v0 to <4 x float> ++ store <4 x float> %v1, ptr %res ++ ret void ++} ++ ++define void @sitofp_v2i64_v2f64(ptr %res, ptr %in){ ++; CHECK-LABEL: sitofp_v2i64_v2f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vffint.d.l $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %in ++ %v1 = sitofp <2 x i64> %v0 to <2 x double> ++ store <2 x double> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll +new file mode 100644 +index 000000000000..3d4913f12e57 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll +@@ -0,0 +1,28 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @uitofp_v4i32_v4f32(ptr %res, ptr %in){ ++; CHECK-LABEL: uitofp_v4i32_v4f32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld 
$vr0, $a1, 0 ++; CHECK-NEXT: vffint.s.wu $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x i32>, ptr %in ++ %v1 = uitofp <4 x i32> %v0 to <4 x float> ++ store <4 x float> %v1, ptr %res ++ ret void ++} ++ ++define void @uitofp_v2i64_v2f64(ptr %res, ptr %in){ ++; CHECK-LABEL: uitofp_v2i64_v2f64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vffint.d.lu $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x i64>, ptr %in ++ %v1 = uitofp <2 x i64> %v0 to <2 x double> ++ store <2 x double> %v1, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 0bf1418c5f46ca74dfc8903757b3bb14e0760633 Mon Sep 17 00:00:00 2001 +From: yjijd +Date: Tue, 23 Jan 2024 15:57:06 +0800 +Subject: [PATCH 32/35] [CodeGen][LoongArch] Set FP_TO_SINT/FP_TO_UINT to legal + for vector types (#79107) + +Support the following conversions: +v4f32->v4i32, v2f64->v2i64(LSX) +v8f32->v8i32, v4f64->v4i64(LASX) +v4f32->v4i64, v4f64->v4i32(LASX) + +(cherry picked from commit 44ba6ebc999d6e9b27bedfe04a993adfd204dc6a) +--- + .../LoongArch/LoongArchISelLowering.cpp | 12 ++-- + .../LoongArch/LoongArchLASXInstrInfo.td | 22 +++++++ + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 8 +++ + .../LoongArch/lasx/ir-instruction/fptosi.ll | 57 +++++++++++++++++++ + .../LoongArch/lasx/ir-instruction/fptoui.ll | 57 +++++++++++++++++++ + .../LoongArch/lsx/ir-instruction/fptosi.ll | 28 +++++++++ + .../LoongArch/lsx/ir-instruction/fptoui.ll | 28 +++++++++ + 7 files changed, 208 insertions(+), 4 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll + create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 
7a360b42e15d..f7eacd56c542 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -256,8 +256,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, + Expand); + } +- setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, +- {MVT::v4i32, MVT::v2i64}, Legal); ++ for (MVT VT : {MVT::v4i32, MVT::v2i64}) { ++ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); ++ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); ++ } + for (MVT VT : {MVT::v4f32, MVT::v2f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); +@@ -300,8 +302,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, + Expand); + } +- setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, +- {MVT::v8i32, MVT::v4i32, MVT::v4i64}, Legal); ++ for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) { ++ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); ++ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); ++ } + for (MVT VT : {MVT::v8f32, MVT::v4f64}) { + setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); + setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); +diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +index b3e74b480922..492b62da6ce7 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +@@ -1633,6 +1633,28 @@ def : Pat<(v4f32 (uint_to_fp v4i64:$vj)), + (XVFFINT_D_LU v4i64:$vj)), + sub_128)>; + ++// XVFTINTRZ_{W_S/L_D} ++def : Pat<(v8i32 (fp_to_sint v8f32:$vj)), (XVFTINTRZ_W_S v8f32:$vj)>; ++def : Pat<(v4i64 (fp_to_sint v4f64:$vj)), (XVFTINTRZ_L_D v4f64:$vj)>; ++def : 
Pat<(v4i64 (fp_to_sint v4f32:$vj)), ++ (VEXT2XV_D_W (SUBREG_TO_REG (i64 0), (VFTINTRZ_W_S v4f32:$vj), ++ sub_128))>; ++def : Pat<(v4i32 (fp_to_sint (v4f64 LASX256:$vj))), ++ (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238), ++ v4f64:$vj)), ++ sub_128)>; ++ ++// XVFTINTRZ_{W_SU/L_DU} ++def : Pat<(v8i32 (fp_to_uint v8f32:$vj)), (XVFTINTRZ_WU_S v8f32:$vj)>; ++def : Pat<(v4i64 (fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)>; ++def : Pat<(v4i64 (fp_to_uint v4f32:$vj)), ++ (VEXT2XV_DU_WU (SUBREG_TO_REG (i64 0), (VFTINTRZ_WU_S v4f32:$vj), ++ sub_128))>; ++def : Pat<(v4i32 (fp_to_uint (v4f64 LASX256:$vj))), ++ (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238), ++ v4f64:$vj)), ++ sub_128)>; ++ + } // Predicates = [HasExtLASX] + + /// Intrinsic pattern +diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +index 63eac4d1aeb7..99ac2f3c162f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +@@ -1750,6 +1750,14 @@ def : Pat<(v2f64 (sint_to_fp v2i64:$vj)), (VFFINT_D_L v2i64:$vj)>; + def : Pat<(v4f32 (uint_to_fp v4i32:$vj)), (VFFINT_S_WU v4i32:$vj)>; + def : Pat<(v2f64 (uint_to_fp v2i64:$vj)), (VFFINT_D_LU v2i64:$vj)>; + ++// VFTINTRZ_{W_S/L_D} ++def : Pat<(v4i32 (fp_to_sint v4f32:$vj)), (VFTINTRZ_W_S v4f32:$vj)>; ++def : Pat<(v2i64 (fp_to_sint v2f64:$vj)), (VFTINTRZ_L_D v2f64:$vj)>; ++ ++// VFTINTRZ_{W_SU/L_DU} ++def : Pat<(v4i32 (fp_to_uint v4f32:$vj)), (VFTINTRZ_WU_S v4f32:$vj)>; ++def : Pat<(v2i64 (fp_to_uint v2f64:$vj)), (VFTINTRZ_LU_D v2f64:$vj)>; ++ + } // Predicates = [HasExtLSX] + + /// Intrinsic pattern +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll +new file mode 100644 +index 000000000000..0d9f57b57ffa +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll +@@ -0,0 +1,57 @@ ++; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fptosi_v8f32_v8i32(ptr %res, ptr %in){ ++; CHECK-LABEL: fptosi_v8f32_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %in ++ %v1 = fptosi <8 x float> %v0 to <8 x i32> ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @fptosi_v4f64_v4i64(ptr %res, ptr %in){ ++; CHECK-LABEL: fptosi_v4f64_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvftintrz.l.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %in ++ %v1 = fptosi <4 x double> %v0 to <4 x i64> ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @fptosi_v4f64_v4i32(ptr %res, ptr %in){ ++; CHECK-LABEL: fptosi_v4f64_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 ++; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %in ++ %v1 = fptosi <4 x double> %v0 to <4 x i32> ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @fptosi_v4f32_v4i64(ptr %res, ptr %in){ ++; CHECK-LABEL: fptosi_v4f32_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vftintrz.w.s $vr0, $vr0 ++; CHECK-NEXT: vext2xv.d.w $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %in ++ %v1 = fptosi <4 x float> %v0 to <4 x i64> ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll +new file mode 100644 +index 000000000000..27d70f33cd34 +--- /dev/null ++++ 
b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll +@@ -0,0 +1,57 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ++ ++define void @fptoui_v8f32_v8i32(ptr %res, ptr %in){ ++; CHECK-LABEL: fptoui_v8f32_v8i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvftintrz.wu.s $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <8 x float>, ptr %in ++ %v1 = fptoui <8 x float> %v0 to <8 x i32> ++ store <8 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @fptoui_v4f64_v4i64(ptr %res, ptr %in){ ++; CHECK-LABEL: fptoui_v4f64_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvftintrz.lu.d $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %in ++ %v1 = fptoui <4 x double> %v0 to <4 x i64> ++ store <4 x i64> %v1, ptr %res ++ ret void ++} ++ ++define void @fptoui_v4f64_v4i32(ptr %res, ptr %in){ ++; CHECK-LABEL: fptoui_v4f64_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: xvld $xr0, $a1, 0 ++; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 ++; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 ++; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x double>, ptr %in ++ %v1 = fptoui <4 x double> %v0 to <4 x i32> ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @fptoui_v4f32_v4i64(ptr %res, ptr %in){ ++; CHECK-LABEL: fptoui_v4f32_v4i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vftintrz.wu.s $vr0, $vr0 ++; CHECK-NEXT: vext2xv.du.wu $xr0, $xr0 ++; CHECK-NEXT: xvst $xr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %in ++ %v1 = fptoui <4 x float> %v0 to <4 x i64> ++ store <4 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll 
+new file mode 100644 +index 000000000000..c3008fe96e47 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll +@@ -0,0 +1,28 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @fptosi_v4f32_v4i32(ptr %res, ptr %in){ ++; CHECK-LABEL: fptosi_v4f32_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vftintrz.w.s $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %in ++ %v1 = fptosi <4 x float> %v0 to <4 x i32> ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @fptosi_v2f64_v2i64(ptr %res, ptr %in){ ++; CHECK-LABEL: fptosi_v2f64_v2i64: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vftintrz.l.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %in ++ %v1 = fptosi <2 x double> %v0 to <2 x i64> ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll +new file mode 100644 +index 000000000000..f0aeb0bd14e7 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll +@@ -0,0 +1,28 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ++; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ++ ++define void @fptoui_v4f32_v4i32(ptr %res, ptr %in){ ++; CHECK-LABEL: fptoui_v4f32_v4i32: ++; CHECK: # %bb.0: ++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vftintrz.wu.s $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <4 x float>, ptr %in ++ %v1 = fptoui <4 x float> %v0 to <4 x i32> ++ store <4 x i32> %v1, ptr %res ++ ret void ++} ++ ++define void @fptoui_v2f64_v2i64(ptr %res, ptr %in){ ++; CHECK-LABEL: fptoui_v2f64_v2i64: ++; CHECK: # %bb.0: 
++; CHECK-NEXT: vld $vr0, $a1, 0 ++; CHECK-NEXT: vftintrz.lu.d $vr0, $vr0 ++; CHECK-NEXT: vst $vr0, $a0, 0 ++; CHECK-NEXT: ret ++ %v0 = load <2 x double>, ptr %in ++ %v1 = fptoui <2 x double> %v0 to <2 x i64> ++ store <2 x i64> %v1, ptr %res ++ ret void ++} +-- +2.20.1 + + +From 66224dcebf8b0cc0d32fa5c73fbb4bca0d885a7d Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Fri, 26 Jan 2024 10:24:07 +0800 +Subject: [PATCH 33/35] [LoongArch] Fixing the incorrect return value of + LoongArchTTIImpl::getRegisterBitWidth (#79441) + +When we do not enable vector features, we should return the default +value (`TargetTransformInfoImplBase::getRegisterBitWidth`) instead of +zero. + +This should fix the LoongArch [buildbot +breakage](https://lab.llvm.org/staging/#/builders/5/builds/486) from + +(cherry picked from commit 1e9924c1f248bbddcb95d82a59708d617297dad3) +(cherry picked from commit 900e7cbfdee09c94d022e4dae923b3c7827f95e3) +--- + .../Target/LoongArch/LoongArchTargetTransformInfo.cpp | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +index 04349aa52b54..d47dded9ea6e 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +@@ -21,17 +21,20 @@ using namespace llvm; + + TypeSize LoongArchTTIImpl::getRegisterBitWidth( + TargetTransformInfo::RegisterKind K) const { ++ TypeSize DefSize = TargetTransformInfoImplBase::getRegisterBitWidth(K); + switch (K) { + case TargetTransformInfo::RGK_Scalar: + return TypeSize::getFixed(ST->is64Bit() ? 
64 : 32); + case TargetTransformInfo::RGK_FixedWidthVector: +- if (ST->hasExtLASX() && ST->hasExpAutoVec()) ++ if (!ST->hasExpAutoVec()) ++ return DefSize; ++ if (ST->hasExtLASX()) + return TypeSize::getFixed(256); +- if (ST->hasExtLSX() && ST->hasExpAutoVec()) ++ if (ST->hasExtLSX()) + return TypeSize::getFixed(128); +- return TypeSize::getFixed(0); ++ [[fallthrough]]; + case TargetTransformInfo::RGK_ScalableVector: +- return TypeSize::getScalable(0); ++ return DefSize; + } + + llvm_unreachable("Unsupported register kind"); +-- +2.20.1 + + +From fe278490b48572e5f5581f35d6b4195f35693b8c Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 9 Jan 2024 20:32:20 +0800 +Subject: [PATCH 34/35] [LoongArch] Pre-commit test for #76913. NFC + +This test will crash with expensive check. + +Crash message: +``` +*** Bad machine code: Using an undefined physical register *** +- function: main +- basic block: %bb.0 entry (0x20fee70) +- instruction: $r3 = frame-destroy ADDI_D $r22, -288 +- operand 1: $r22 +``` + +(cherry picked from commit f499472de3e1184b83fc6cd78bc244a55f2cac7d) +--- + .../LoongArch/can-not-realign-stack.ll | 39 +++++++++++++++++++ + 1 file changed, 39 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll + +diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll +new file mode 100644 +index 000000000000..526821076498 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll +@@ -0,0 +1,39 @@ ++; REQUIRES: expensive_checks ++; RUN: llc --mtriple=loongarch64 --frame-pointer=none --mattr=+lasx < %s ++ ++; XFAIL: * ++ ++;; FIXME: This test will crash with expensive check. The subsequent patch will ++;; address and fix this issue. 
++ ++%struct.S = type { [64 x i16] } ++ ++define dso_local noundef signext i32 @main() nounwind { ++entry: ++ %s = alloca %struct.S, align 2 ++ call void @llvm.lifetime.start.p0(i64 128, ptr nonnull %s) ++ store <16 x i16> , ptr %s, align 2 ++ %0 = getelementptr inbounds [64 x i16], ptr %s, i64 0, i64 16 ++ store <16 x i16> , ptr %0, align 2 ++ %1 = getelementptr inbounds [64 x i16], ptr %s, i64 0, i64 32 ++ store <16 x i16> , ptr %1, align 2 ++ %2 = getelementptr inbounds [64 x i16], ptr %s, i64 0, i64 48 ++ store <16 x i16> , ptr %2, align 2 ++ call void @foo(ptr noundef nonnull %s) ++ store <16 x i16> , ptr %s, align 2 ++ %3 = getelementptr inbounds [64 x i16], ptr %s, i64 0, i64 16 ++ store <16 x i16> , ptr %3, align 2 ++ %4 = getelementptr inbounds [64 x i16], ptr %s, i64 0, i64 32 ++ store <16 x i16> , ptr %4, align 2 ++ %5 = getelementptr inbounds [64 x i16], ptr %s, i64 0, i64 48 ++ store <16 x i16> , ptr %5, align 2 ++ call void @bar(ptr noundef nonnull %s) ++ call void @llvm.lifetime.end.p0(i64 128, ptr nonnull %s) ++ ret i32 0 ++} ++ ++declare void @foo(ptr nocapture noundef) ++declare void @bar(ptr nocapture noundef) ++ ++declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) ++declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) +-- +2.20.1 + + +From e3e2d0c2cb7cfaffe2663f5f8607dad09fcdf3a5 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Tue, 9 Jan 2024 20:35:49 +0800 +Subject: [PATCH 35/35] [LoongArch] Implement + LoongArchRegisterInfo::canRealignStack() (#76913) + +This patch fixes the crash issue in the test: +CodeGen/LoongArch/can-not-realign-stack.ll + +Register allocator may spill virtual registers to the stack, which +introduces stack alignment requirements (when the size of spilled + registers exceeds the default alignment size of the stack). If a +function does not have stack alignment requirements before register +allocation, registers used for stack alignment will not be preserved. 
+ +Therefore, we should implement `canRealignStack()` to inform the +register allocator whether it is allowed to perform stack realignment +operations. + +(cherry picked from commit 98c6aa72299caeff6b188e1ff2fc1b39c5b893b6) +--- + .../LoongArch/LoongArchRegisterInfo.cpp | 23 ++++++++ + .../Target/LoongArch/LoongArchRegisterInfo.h | 1 + + .../LoongArch/can-not-realign-stack.ll | 56 +++++++++++++++++-- + 3 files changed, 75 insertions(+), 5 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +index 257b947a3ce4..092b5f1fb442 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +@@ -15,6 +15,7 @@ + #include "LoongArch.h" + #include "LoongArchInstrInfo.h" + #include "LoongArchSubtarget.h" ++#include "MCTargetDesc/LoongArchBaseInfo.h" + #include "MCTargetDesc/LoongArchMCTargetDesc.h" + #include "llvm/CodeGen/MachineFrameInfo.h" + #include "llvm/CodeGen/MachineFunction.h" +@@ -194,3 +195,25 @@ bool LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); + return false; + } ++ ++bool LoongArchRegisterInfo::canRealignStack(const MachineFunction &MF) const { ++ if (!TargetRegisterInfo::canRealignStack(MF)) ++ return false; ++ ++ const MachineRegisterInfo *MRI = &MF.getRegInfo(); ++ const LoongArchFrameLowering *TFI = getFrameLowering(MF); ++ ++ // Stack realignment requires a frame pointer. If we already started ++ // register allocation with frame pointer elimination, it is too late now. ++ if (!MRI->canReserveReg(LoongArch::R22)) ++ return false; ++ ++ // We may also need a base pointer if there are dynamic allocas or stack ++ // pointer adjustments around calls. ++ if (TFI->hasReservedCallFrame(MF)) ++ return true; ++ ++ // A base pointer is required and allowed. Check that it isn't too late to ++ // reserve it. 
++ return MRI->canReserveReg(LoongArchABI::getBPReg()); ++} +diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +index 7e8f26b14097..d1e40254c297 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h ++++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +@@ -51,6 +51,7 @@ struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo { + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { + return true; + } ++ bool canRealignStack(const MachineFunction &MF) const override; + }; + } // end namespace llvm + +diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll +index 526821076498..af24ae64b7c7 100644 +--- a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll ++++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll +@@ -1,14 +1,60 @@ +-; REQUIRES: expensive_checks +-; RUN: llc --mtriple=loongarch64 --frame-pointer=none --mattr=+lasx < %s ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ++; RUN: llc --mtriple=loongarch64 --frame-pointer=none --mattr=+lasx < %s | FileCheck %s + +-; XFAIL: * ++;; This test is checking that when a function allows stack realignment and ++;; realignment needs were not detected before register allocation (at this ++;; point, fp is not preserved), but realignment is required during register ++;; allocation, the stack should not undergo realignment. + +-;; FIXME: This test will crash with expensive check. The subsequent patch will +-;; address and fix this issue. ++;; Ensure that the `bstrins.d $sp, $zero, n, 0` instruction is not generated. 
++;; n = log2(realign_size) - 1 + + %struct.S = type { [64 x i16] } + + define dso_local noundef signext i32 @main() nounwind { ++; CHECK-LABEL: main: ++; CHECK: # %bb.0: # %entry ++; CHECK-NEXT: addi.d $sp, $sp, -272 ++; CHECK-NEXT: st.d $ra, $sp, 264 # 8-byte Folded Spill ++; CHECK-NEXT: st.d $fp, $sp, 256 # 8-byte Folded Spill ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 96 # 32-byte Folded Spill ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_1) ++; CHECK-NEXT: xvld $xr1, $a0, 0 ++; CHECK-NEXT: xvst $xr1, $sp, 64 # 32-byte Folded Spill ++; CHECK-NEXT: xvst $xr1, $sp, 224 ++; CHECK-NEXT: xvst $xr0, $sp, 192 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_2) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_2) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill ++; CHECK-NEXT: xvst $xr0, $sp, 160 ++; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_3) ++; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_3) ++; CHECK-NEXT: xvld $xr0, $a0, 0 ++; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill ++; CHECK-NEXT: xvst $xr0, $sp, 128 ++; CHECK-NEXT: addi.d $fp, $sp, 128 ++; CHECK-NEXT: move $a0, $fp ++; CHECK-NEXT: bl %plt(foo) ++; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload ++; CHECK-NEXT: xvst $xr0, $sp, 224 ++; CHECK-NEXT: xvld $xr0, $sp, 96 # 32-byte Folded Reload ++; CHECK-NEXT: xvst $xr0, $sp, 192 ++; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload ++; CHECK-NEXT: xvst $xr0, $sp, 160 ++; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload ++; CHECK-NEXT: xvst $xr0, $sp, 128 ++; CHECK-NEXT: move $a0, $fp ++; CHECK-NEXT: bl %plt(bar) ++; CHECK-NEXT: move $a0, $zero ++; CHECK-NEXT: ld.d $fp, $sp, 256 # 8-byte Folded Reload ++; CHECK-NEXT: ld.d $ra, $sp, 264 # 8-byte Folded Reload ++; CHECK-NEXT: addi.d $sp, $sp, 272 ++; CHECK-NEXT: ret + entry: 
+ %s = alloca %struct.S, align 2 + call void @llvm.lifetime.start.p0(i64 128, ptr nonnull %s) +-- +2.20.1 + diff --git a/0012-Backport-LoongArch-improve-the-support-for-compiler-rt-and-bugfix.patch b/0012-Backport-LoongArch-improve-the-support-for-compiler-rt-and-bugfix.patch new file mode 100644 index 0000000..e40be81 --- /dev/null +++ b/0012-Backport-LoongArch-improve-the-support-for-compiler-rt-and-bugfix.patch @@ -0,0 +1,2474 @@ +From 0bce68310dc0ff6a09ec2cf5c3ae32400c631324 Mon Sep 17 00:00:00 2001 +From: zhanglimin +Date: Tue, 12 Sep 2023 09:51:16 +0800 +Subject: [PATCH 01/14] [sanitizer][msan] VarArgHelper for loongarch64 + +This patch adds support for variadic argument for loongarch64, +which is based on MIPS64. And `check-msan` all pass. + +Reviewed By: vitalybuka + +Differential Revision: https://reviews.llvm.org/D158587 + +(cherry picked from commit ec42c78cc43ac1e8364e5a0941aa5fc91b813dd3) +--- + .../Instrumentation/MemorySanitizer.cpp | 7 ++ + .../LoongArch/vararg-loongarch64.ll | 78 +++++++++++++++++++ + 2 files changed, 85 insertions(+) + create mode 100644 llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll + +diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +index 83d90049abc3..362fd6e4151f 100644 +--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp ++++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +@@ -4945,6 +4945,7 @@ struct VarArgAMD64Helper : public VarArgHelper { + }; + + /// MIPS64-specific implementation of VarArgHelper. ++/// NOTE: This is also used for LoongArch64. + struct VarArgMIPS64Helper : public VarArgHelper { + Function &F; + MemorySanitizer &MS; +@@ -5836,6 +5837,10 @@ struct VarArgSystemZHelper : public VarArgHelper { + } + }; + ++// Loongarch64 is not a MIPS, but the current vargs calling convention matches ++// the MIPS. 
++using VarArgLoongArch64Helper = VarArgMIPS64Helper; ++ + /// A no-op implementation of VarArgHelper. + struct VarArgNoOpHelper : public VarArgHelper { + VarArgNoOpHelper(Function &F, MemorySanitizer &MS, +@@ -5868,6 +5873,8 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, + return new VarArgPowerPC64Helper(Func, Msan, Visitor); + else if (TargetTriple.getArch() == Triple::systemz) + return new VarArgSystemZHelper(Func, Msan, Visitor); ++ else if (TargetTriple.isLoongArch64()) ++ return new VarArgLoongArch64Helper(Func, Msan, Visitor); + else + return new VarArgNoOpHelper(Func, Msan, Visitor); + } +diff --git a/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll b/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll +new file mode 100644 +index 000000000000..8a4ab59588ad +--- /dev/null ++++ b/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll +@@ -0,0 +1,78 @@ ++; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s ++ ++target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" ++target triple = "loongarch64-unknown-linux-gnu" ++ ++;; First, check allocation of the save area. ++declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #1 ++declare void @llvm.va_start(ptr) #2 ++declare void @llvm.va_end(ptr) #2 ++declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #1 ++define i32 @foo(i32 %guard, ...) 
{ ++; CHECK-LABEL: @foo ++; CHECK: [[TMP1:%.*]] = load {{.*}} @__msan_va_arg_overflow_size_tls ++; CHECK: [[TMP2:%.*]] = add i64 0, [[TMP1]] ++; CHECK: [[TMP3:%.*]] = alloca {{.*}} [[TMP2]] ++; CHECK: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 [[TMP2]], i1 false) ++; CHECK: [[TMP4:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 800) ++; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP4]], i1 false) ++; ++ %vl = alloca ptr, align 8 ++ call void @llvm.lifetime.start.p0(i64 32, ptr %vl) ++ call void @llvm.va_start(ptr %vl) ++ call void @llvm.va_end(ptr %vl) ++ call void @llvm.lifetime.end.p0(i64 32, ptr %vl) ++ ret i32 0 ++} ++ ++;; Save the incoming shadow value from the arguments in the __msan_va_arg_tls ++;; array. ++define i32 @bar() { ++; CHECK-LABEL: @bar ++; CHECK: store i32 0, ptr @__msan_va_arg_tls, align 8 ++; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 ++; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 ++; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ++; ++ %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) ++ ret i32 %1 ++} ++ ++;; Check multiple fixed arguments. ++declare i32 @foo2(i32 %g1, i32 %g2, ...) ++define i32 @bar2() { ++; CHECK-LABEL: @bar2 ++; CHECK: store i64 0, ptr @__msan_va_arg_tls, align 8 ++; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 ++; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls ++; ++ %1 = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) ++ ret i32 %1 ++} ++ ++;; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ++;; passed to a variadic function. ++declare i64 @sum(i64 %n, ...) 
++define dso_local i64 @many_args() { ++;; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed. ++; CHECK-LABEL: @many_args ++; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) ++; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800) ++; ++entry: ++ %ret = call i64 (i64, ...) @sum(i64 120, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, ++ i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1 ++ ) ++ ret i64 %ret ++} +-- +2.20.1 + + +From f1265a12fa947b79967552ab520f904486c76353 Mon Sep 17 00:00:00 2001 +From: Ami-zhang <96056515+Ami-zhang@users.noreply.github.com> +Date: Thu, 28 Sep 2023 15:26:18 +0800 +Subject: [PATCH 02/14] [LowerTypeTests] Add loongarch64 to CFI jumptables + (#67312) + +This patch implements jump tables for loongarch64. 
+ +(cherry picked from commit 0e8a8c85f8765c086c573f36e60c895920381e18) +--- + llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 9 ++++++++- + llvm/test/Transforms/LowerTypeTests/function-weak.ll | 2 ++ + llvm/test/Transforms/LowerTypeTests/function.ll | 9 +++++++++ + 3 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +index 9b4b3efd7283..a89d57d12615 100644 +--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp ++++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +@@ -1196,6 +1196,7 @@ static const unsigned kARMJumpTableEntrySize = 4; + static const unsigned kARMBTIJumpTableEntrySize = 8; + static const unsigned kARMv6MJumpTableEntrySize = 16; + static const unsigned kRISCVJumpTableEntrySize = 8; ++static const unsigned kLOONGARCH64JumpTableEntrySize = 8; + + unsigned LowerTypeTestsModule::getJumpTableEntrySize() { + switch (JumpTableArch) { +@@ -1222,6 +1223,8 @@ unsigned LowerTypeTestsModule::getJumpTableEntrySize() { + case Triple::riscv32: + case Triple::riscv64: + return kRISCVJumpTableEntrySize; ++ case Triple::loongarch64: ++ return kLOONGARCH64JumpTableEntrySize; + default: + report_fatal_error("Unsupported architecture for jump tables"); + } +@@ -1286,6 +1289,9 @@ void LowerTypeTestsModule::createJumpTableEntry( + } else if (JumpTableArch == Triple::riscv32 || + JumpTableArch == Triple::riscv64) { + AsmOS << "tail $" << ArgIndex << "@plt\n"; ++ } else if (JumpTableArch == Triple::loongarch64) { ++ AsmOS << "pcalau12i $$t0, %pc_hi20($" << ArgIndex << ")\n" ++ << "jirl $$r0, $$t0, %pc_lo12($" << ArgIndex << ")\n"; + } else { + report_fatal_error("Unsupported architecture for jump tables"); + } +@@ -1304,7 +1310,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctions( + ArrayRef TypeIds, ArrayRef Functions) { + if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || + Arch == Triple::thumb || Arch == Triple::aarch64 || +- Arch == Triple::riscv32 || 
Arch == Triple::riscv64) ++ Arch == Triple::riscv32 || Arch == Triple::riscv64 || ++ Arch == Triple::loongarch64) + buildBitSetsFromFunctionsNative(TypeIds, Functions); + else if (Arch == Triple::wasm32 || Arch == Triple::wasm64) + buildBitSetsFromFunctionsWASM(TypeIds, Functions); +diff --git a/llvm/test/Transforms/LowerTypeTests/function-weak.ll b/llvm/test/Transforms/LowerTypeTests/function-weak.ll +index ff69abacc8e9..c765937f1991 100644 +--- a/llvm/test/Transforms/LowerTypeTests/function-weak.ll ++++ b/llvm/test/Transforms/LowerTypeTests/function-weak.ll +@@ -4,6 +4,7 @@ + ; RUN: opt -S -passes=lowertypetests -mtriple=aarch64-unknown-linux-gnu %s | FileCheck --check-prefixes=CHECK,ARM %s + ; RUN: opt -S -passes=lowertypetests -mtriple=riscv32-unknown-linux-gnu %s | FileCheck --check-prefixes=CHECK,RISCV %s + ; RUN: opt -S -passes=lowertypetests -mtriple=riscv64-unknown-linux-gnu %s | FileCheck --check-prefixes=CHECK,RISCV %s ++; RUN: opt -S -passes=lowertypetests -mtriple=loongarch64-unknown-linux-gnu %s | FileCheck --check-prefixes=CHECK,LOONGARCH64 %s + + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" +@@ -116,6 +117,7 @@ define i1 @foo(ptr %p) { + ; X86: define private void @[[JT]]() #{{.*}} align 8 { + ; ARM: define private void @[[JT]]() #{{.*}} align 4 { + ; RISCV: define private void @[[JT]]() #{{.*}} align 8 { ++; LOONGARCH64: define private void @[[JT]]() #{{.*}} align 8 { + + ; CHECK: define internal void @__cfi_global_var_init() section ".text.startup" { + ; CHECK-NEXT: entry: +diff --git a/llvm/test/Transforms/LowerTypeTests/function.ll b/llvm/test/Transforms/LowerTypeTests/function.ll +index 968c9d434eb2..802b88d92977 100644 +--- a/llvm/test/Transforms/LowerTypeTests/function.ll ++++ b/llvm/test/Transforms/LowerTypeTests/function.ll +@@ -5,6 +5,7 @@ + ; RUN: opt -S -passes=lowertypetests -mtriple=riscv32-unknown-linux-gnu %s | FileCheck --check-prefixes=RISCV,NATIVE %s + ; RUN: opt -S 
-passes=lowertypetests -mtriple=riscv64-unknown-linux-gnu %s | FileCheck --check-prefixes=RISCV,NATIVE %s + ; RUN: opt -S -passes=lowertypetests -mtriple=wasm32-unknown-unknown %s | FileCheck --check-prefix=WASM32 %s ++; RUN: opt -S -passes=lowertypetests -mtriple=loongarch64-unknown-linux-gnu %s | FileCheck --check-prefixes=LOONGARCH64,NATIVE %s + + ; The right format for Arm jump tables depends on the selected + ; subtarget, so we can't get these tests right without the Arm target +@@ -34,6 +35,7 @@ target datalayout = "e-p:64:64" + ; THUMB: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1) + ; THUMBV6M: @g = internal alias void (), getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1) + ; RISCV: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1) ++; LOONGARCH64: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1) + + ; NATIVE: define hidden void @f.cfi() + ; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]] +@@ -65,6 +67,7 @@ define i1 @foo(ptr %p) { + ; THUMB: define private void @[[JT]]() #[[ATTR:.*]] align 4 { + ; THUMBV6M: define private void @[[JT]]() #[[ATTR:.*]] align 16 { + ; RISCV: define private void @[[JT]]() #[[ATTR:.*]] align 8 { ++; LOONGARCH64: define private void @[[JT]]() #[[ATTR:.*]] align 8 { + + ; X86: jmp ${0:c}@plt + ; X86-SAME: int3 +@@ -99,6 +102,11 @@ define i1 @foo(ptr %p) { + ; RISCV: tail $0@plt + ; RISCV-SAME: tail $1@plt + ++; LOONGARCH64: pcalau12i $$t0, %pc_hi20($0) ++; LOONGARCH64-SAME: jirl $$r0, $$t0, %pc_lo12($0) ++; LOONGARCH64-SAME: pcalau12i $$t0, %pc_hi20($1) ++; LOONGARCH64-SAME: jirl $$r0, $$t0, %pc_lo12($1) ++ + ; NATIVE-SAME: "s,s"(ptr @f.cfi, ptr @g.cfi) + + ; X86-LINUX: attributes #[[ATTR]] = { naked nocf_check nounwind } +@@ -107,6 +115,7 @@ define i1 @foo(ptr %p) { + ; THUMB: attributes #[[ATTR]] = { naked nounwind "target-cpu"="cortex-a8" 
"target-features"="+thumb-mode" } + ; THUMBV6M: attributes #[[ATTR]] = { naked nounwind "target-features"="+thumb-mode" } + ; RISCV: attributes #[[ATTR]] = { naked nounwind "target-features"="-c,-relax" } ++; LOONGARCH64: attributes #[[ATTR]] = { naked nounwind } + + ; WASM32: ![[I0]] = !{i64 1} + ; WASM32: ![[I1]] = !{i64 2} +-- +2.20.1 + + +From 6f3143e1ad0bb759b7519af81994ed3c71dcf52b Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Fri, 20 Oct 2023 10:44:55 +0800 +Subject: [PATCH 03/14] [LoongArch] Fix td pattern for CACOP LDPTE and LDDIR + +The immediate argument should be a target constant (`timm`). + +(cherry picked from commit 47826b3f148996767ebd2c67ee41c329cb364fef) +--- + llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index b2c4bb812ba5..166379d7d592 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -1857,9 +1857,9 @@ defm : PseudoBinPat<"atomic_load_xor_32", PseudoAtomicLoadXor32>; + /// Intrinsics + + def : Pat<(int_loongarch_cacop_d timm:$op, i64:$rj, timm:$imm12), +- (CACOP uimm5:$op, GPR:$rj, simm12:$imm12)>; ++ (CACOP timm:$op, GPR:$rj, timm:$imm12)>; + def : Pat<(int_loongarch_cacop_w i32:$op, i32:$rj, i32:$imm12), +- (CACOP uimm5:$op, GPR:$rj, simm12:$imm12)>; ++ (CACOP timm:$op, GPR:$rj, timm:$imm12)>; + def : Pat<(loongarch_dbar uimm15:$imm15), (DBAR uimm15:$imm15)>; + def : Pat<(loongarch_ibar uimm15:$imm15), (IBAR uimm15:$imm15)>; + def : Pat<(loongarch_break uimm15:$imm15), (BREAK uimm15:$imm15)>; +@@ -2023,9 +2023,9 @@ def : Pat<(int_loongarch_asrtle_d GPR:$rj, GPR:$rk), + def : Pat<(int_loongarch_asrtgt_d GPR:$rj, GPR:$rk), + (ASRTGT_D GPR:$rj, GPR:$rk)>; + def : Pat<(int_loongarch_lddir_d GPR:$rj, timm:$imm8), +- (LDDIR GPR:$rj, uimm8:$imm8)>; ++ (LDDIR GPR:$rj, timm:$imm8)>; + def : 
Pat<(int_loongarch_ldpte_d GPR:$rj, timm:$imm8), +- (LDPTE GPR:$rj, uimm8:$imm8)>; ++ (LDPTE GPR:$rj, timm:$imm8)>; + } // Predicates = [IsLA64] + + //===----------------------------------------------------------------------===// +-- +2.20.1 + + +From d90b85e94180543fd1789f9e26d7931f2329069b Mon Sep 17 00:00:00 2001 +From: ZhaoQi +Date: Fri, 10 Nov 2023 15:54:33 +0800 +Subject: [PATCH 04/14] [LoongArch][MC] Refine MCInstrAnalysis based on + registers used (#71276) + +MCInstrAnalysis can return properties of instructions (e.g., isCall(), +isBranch(),...) based on the informations that MCInstrDesc can get from +*InstrInfo*.td files. These infos are based on opcodes only, but JIRL +can have different properties based on different registers used. + +So this patch refines several MCInstrAnalysis methods: isTerminator, +isCall,isReturn,isBranch,isUnconditionalBranch and isIndirectBranch. + +This patch also allows BOLT which will be supported on LoongArch later +to get right instruction infos. 
+ +(cherry picked from commit f7d784709673ca185f6fb0633fd53c72e81f2ae1) +--- + .../MCTargetDesc/LoongArchMCTargetDesc.cpp | 76 +++++++++++++ + .../unittests/Target/LoongArch/CMakeLists.txt | 1 + + .../Target/LoongArch/MCInstrAnalysisTest.cpp | 107 ++++++++++++++++++ + 3 files changed, 184 insertions(+) + create mode 100644 llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +index 942e667bc261..d580c3457fec 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +@@ -104,6 +104,82 @@ public: + + return false; + } ++ ++ bool isTerminator(const MCInst &Inst) const override { ++ if (MCInstrAnalysis::isTerminator(Inst)) ++ return true; ++ ++ switch (Inst.getOpcode()) { ++ default: ++ return false; ++ case LoongArch::JIRL: ++ return Inst.getOperand(0).getReg() == LoongArch::R0; ++ } ++ } ++ ++ bool isCall(const MCInst &Inst) const override { ++ if (MCInstrAnalysis::isCall(Inst)) ++ return true; ++ ++ switch (Inst.getOpcode()) { ++ default: ++ return false; ++ case LoongArch::JIRL: ++ return Inst.getOperand(0).getReg() != LoongArch::R0; ++ } ++ } ++ ++ bool isReturn(const MCInst &Inst) const override { ++ if (MCInstrAnalysis::isReturn(Inst)) ++ return true; ++ ++ switch (Inst.getOpcode()) { ++ default: ++ return false; ++ case LoongArch::JIRL: ++ return Inst.getOperand(0).getReg() == LoongArch::R0 && ++ Inst.getOperand(1).getReg() == LoongArch::R1; ++ } ++ } ++ ++ bool isBranch(const MCInst &Inst) const override { ++ if (MCInstrAnalysis::isBranch(Inst)) ++ return true; ++ ++ switch (Inst.getOpcode()) { ++ default: ++ return false; ++ case LoongArch::JIRL: ++ return Inst.getOperand(0).getReg() == LoongArch::R0 && ++ Inst.getOperand(1).getReg() != LoongArch::R1; ++ } ++ } ++ ++ bool isUnconditionalBranch(const MCInst &Inst) const 
override { ++ if (MCInstrAnalysis::isUnconditionalBranch(Inst)) ++ return true; ++ ++ switch (Inst.getOpcode()) { ++ default: ++ return false; ++ case LoongArch::JIRL: ++ return Inst.getOperand(0).getReg() == LoongArch::R0 && ++ Inst.getOperand(1).getReg() != LoongArch::R1; ++ } ++ } ++ ++ bool isIndirectBranch(const MCInst &Inst) const override { ++ if (MCInstrAnalysis::isIndirectBranch(Inst)) ++ return true; ++ ++ switch (Inst.getOpcode()) { ++ default: ++ return false; ++ case LoongArch::JIRL: ++ return Inst.getOperand(0).getReg() == LoongArch::R0 && ++ Inst.getOperand(1).getReg() != LoongArch::R1; ++ } ++ } + }; + + } // end namespace +diff --git a/llvm/unittests/Target/LoongArch/CMakeLists.txt b/llvm/unittests/Target/LoongArch/CMakeLists.txt +index fef4f8e15461..e6f8ec073721 100644 +--- a/llvm/unittests/Target/LoongArch/CMakeLists.txt ++++ b/llvm/unittests/Target/LoongArch/CMakeLists.txt +@@ -20,6 +20,7 @@ set(LLVM_LINK_COMPONENTS + + add_llvm_target_unittest(LoongArchTests + InstSizes.cpp ++ MCInstrAnalysisTest.cpp + ) + + set_property(TARGET LoongArchTests PROPERTY FOLDER "Tests/UnitTests/TargetTests") +diff --git a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp +new file mode 100644 +index 000000000000..6a208d274a0d +--- /dev/null ++++ b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp +@@ -0,0 +1,107 @@ ++//===- MCInstrAnalysisTest.cpp - LoongArchMCInstrAnalysis unit tests ------===// ++// ++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. ++// See https://llvm.org/LICENSE.txt for license information. 
++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/MC/MCInstrAnalysis.h" ++#include "MCTargetDesc/LoongArchMCTargetDesc.h" ++#include "llvm/MC/MCInstBuilder.h" ++#include "llvm/MC/TargetRegistry.h" ++#include "llvm/Support/TargetSelect.h" ++ ++#include "gtest/gtest.h" ++ ++#include ++ ++using namespace llvm; ++ ++namespace { ++ ++class InstrAnalysisTest : public testing::TestWithParam { ++protected: ++ std::unique_ptr Info; ++ std::unique_ptr Analysis; ++ ++ static void SetUpTestSuite() { ++ LLVMInitializeLoongArchTargetInfo(); ++ LLVMInitializeLoongArchTarget(); ++ LLVMInitializeLoongArchTargetMC(); ++ } ++ ++ InstrAnalysisTest() { ++ std::string Error; ++ const Target *TheTarget = ++ TargetRegistry::lookupTarget(Triple::normalize(GetParam()), Error); ++ Info = std::unique_ptr(TheTarget->createMCInstrInfo()); ++ Analysis = std::unique_ptr( ++ TheTarget->createMCInstrAnalysis(Info.get())); ++ } ++}; ++ ++} // namespace ++ ++static MCInst beq() { ++ return MCInstBuilder(LoongArch::BEQ) ++ .addReg(LoongArch::R0) ++ .addReg(LoongArch::R1) ++ .addImm(32); ++} ++ ++static MCInst bl() { return MCInstBuilder(LoongArch::BL).addImm(32); } ++ ++static MCInst jirl(unsigned RD, unsigned RJ = LoongArch::R10) { ++ return MCInstBuilder(LoongArch::JIRL).addReg(RD).addReg(RJ).addImm(16); ++} ++ ++TEST_P(InstrAnalysisTest, IsTerminator) { ++ EXPECT_TRUE(Analysis->isTerminator(beq())); ++ EXPECT_FALSE(Analysis->isTerminator(bl())); ++ EXPECT_TRUE(Analysis->isTerminator(jirl(LoongArch::R0))); ++ EXPECT_FALSE(Analysis->isTerminator(jirl(LoongArch::R5))); ++} ++ ++TEST_P(InstrAnalysisTest, IsCall) { ++ EXPECT_FALSE(Analysis->isCall(beq())); ++ EXPECT_TRUE(Analysis->isCall(bl())); ++ EXPECT_TRUE(Analysis->isCall(jirl(LoongArch::R1))); ++ EXPECT_FALSE(Analysis->isCall(jirl(LoongArch::R0))); ++} ++ ++TEST_P(InstrAnalysisTest, IsReturn) { ++ 
EXPECT_FALSE(Analysis->isReturn(beq())); ++ EXPECT_FALSE(Analysis->isReturn(bl())); ++ EXPECT_TRUE(Analysis->isReturn(jirl(LoongArch::R0, LoongArch::R1))); ++ EXPECT_FALSE(Analysis->isReturn(jirl(LoongArch::R0))); ++ EXPECT_FALSE(Analysis->isReturn(jirl(LoongArch::R1))); ++} ++ ++TEST_P(InstrAnalysisTest, IsBranch) { ++ EXPECT_TRUE(Analysis->isBranch(beq())); ++ EXPECT_FALSE(Analysis->isBranch(bl())); ++ EXPECT_TRUE(Analysis->isBranch(jirl(LoongArch::R0))); ++ EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R1))); ++ EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R0, LoongArch::R1))); ++} ++ ++TEST_P(InstrAnalysisTest, IsUnconditionalBranch) { ++ EXPECT_FALSE(Analysis->isUnconditionalBranch(beq())); ++ EXPECT_FALSE(Analysis->isUnconditionalBranch(bl())); ++ EXPECT_TRUE(Analysis->isUnconditionalBranch(jirl(LoongArch::R0))); ++ EXPECT_FALSE(Analysis->isUnconditionalBranch(jirl(LoongArch::R1))); ++ EXPECT_FALSE( ++ Analysis->isUnconditionalBranch(jirl(LoongArch::R0, LoongArch::R1))); ++} ++ ++TEST_P(InstrAnalysisTest, IsIndirectBranch) { ++ EXPECT_FALSE(Analysis->isIndirectBranch(beq())); ++ EXPECT_FALSE(Analysis->isIndirectBranch(bl())); ++ EXPECT_TRUE(Analysis->isIndirectBranch(jirl(LoongArch::R0))); ++ EXPECT_FALSE(Analysis->isIndirectBranch(jirl(LoongArch::R1))); ++ EXPECT_FALSE(Analysis->isIndirectBranch(jirl(LoongArch::R0, LoongArch::R1))); ++} ++ ++INSTANTIATE_TEST_SUITE_P(LA32And64, InstrAnalysisTest, ++ testing::Values("loongarch32", "loongarch64")); +-- +2.20.1 + + +From 4d3ba0892d66b21f6a8a72f1d787e42a64be8867 Mon Sep 17 00:00:00 2001 +From: ZhaoQi +Date: Wed, 15 Nov 2023 11:12:30 +0800 +Subject: [PATCH 05/14] [LoongArch][NFC] Pre-commit MCInstrAnalysis tests for + instruction 'b' (#71903) + +The tests for 'b' which commented with FIXME are incorrect, the +following patch will fix it. 
+ +(cherry picked from commit f6c4bb07eaa94bcd5d02ba7a46850225b6ed50d4) +--- + .../Target/LoongArch/MCInstrAnalysisTest.cpp | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +diff --git a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp +index 6a208d274a0d..6e1919fc2261 100644 +--- a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp ++++ b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp +@@ -50,6 +50,8 @@ static MCInst beq() { + .addImm(32); + } + ++static MCInst b() { return MCInstBuilder(LoongArch::B).addImm(32); } ++ + static MCInst bl() { return MCInstBuilder(LoongArch::BL).addImm(32); } + + static MCInst jirl(unsigned RD, unsigned RJ = LoongArch::R10) { +@@ -58,6 +60,7 @@ static MCInst jirl(unsigned RD, unsigned RJ = LoongArch::R10) { + + TEST_P(InstrAnalysisTest, IsTerminator) { + EXPECT_TRUE(Analysis->isTerminator(beq())); ++ EXPECT_TRUE(Analysis->isTerminator(b())); + EXPECT_FALSE(Analysis->isTerminator(bl())); + EXPECT_TRUE(Analysis->isTerminator(jirl(LoongArch::R0))); + EXPECT_FALSE(Analysis->isTerminator(jirl(LoongArch::R5))); +@@ -65,6 +68,7 @@ TEST_P(InstrAnalysisTest, IsTerminator) { + + TEST_P(InstrAnalysisTest, IsCall) { + EXPECT_FALSE(Analysis->isCall(beq())); ++ EXPECT_FALSE(Analysis->isCall(b())); + EXPECT_TRUE(Analysis->isCall(bl())); + EXPECT_TRUE(Analysis->isCall(jirl(LoongArch::R1))); + EXPECT_FALSE(Analysis->isCall(jirl(LoongArch::R0))); +@@ -72,6 +76,7 @@ TEST_P(InstrAnalysisTest, IsCall) { + + TEST_P(InstrAnalysisTest, IsReturn) { + EXPECT_FALSE(Analysis->isReturn(beq())); ++ EXPECT_FALSE(Analysis->isReturn(b())); + EXPECT_FALSE(Analysis->isReturn(bl())); + EXPECT_TRUE(Analysis->isReturn(jirl(LoongArch::R0, LoongArch::R1))); + EXPECT_FALSE(Analysis->isReturn(jirl(LoongArch::R0))); +@@ -80,14 +85,26 @@ TEST_P(InstrAnalysisTest, IsReturn) { + + TEST_P(InstrAnalysisTest, IsBranch) { + EXPECT_TRUE(Analysis->isBranch(beq())); ++ 
EXPECT_TRUE(Analysis->isBranch(b())); + EXPECT_FALSE(Analysis->isBranch(bl())); + EXPECT_TRUE(Analysis->isBranch(jirl(LoongArch::R0))); + EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R1))); + EXPECT_FALSE(Analysis->isBranch(jirl(LoongArch::R0, LoongArch::R1))); + } + ++TEST_P(InstrAnalysisTest, IsConditionalBranch) { ++ EXPECT_TRUE(Analysis->isConditionalBranch(beq())); ++ // FIXME: Instr 'b' is not a ConditionalBranch, so the analysis here is ++ // wrong. The following patch will fix it. ++ EXPECT_TRUE(Analysis->isConditionalBranch(b())); ++ EXPECT_FALSE(Analysis->isConditionalBranch(bl())); ++} ++ + TEST_P(InstrAnalysisTest, IsUnconditionalBranch) { + EXPECT_FALSE(Analysis->isUnconditionalBranch(beq())); ++ // FIXME: Instr 'b' is an UnconditionalBranch, so the analysis here is ++ // wrong. The following patch will fix it. ++ EXPECT_FALSE(Analysis->isUnconditionalBranch(b())); + EXPECT_FALSE(Analysis->isUnconditionalBranch(bl())); + EXPECT_TRUE(Analysis->isUnconditionalBranch(jirl(LoongArch::R0))); + EXPECT_FALSE(Analysis->isUnconditionalBranch(jirl(LoongArch::R1))); +@@ -97,6 +114,7 @@ TEST_P(InstrAnalysisTest, IsUnconditionalBranch) { + + TEST_P(InstrAnalysisTest, IsIndirectBranch) { + EXPECT_FALSE(Analysis->isIndirectBranch(beq())); ++ EXPECT_FALSE(Analysis->isIndirectBranch(b())); + EXPECT_FALSE(Analysis->isIndirectBranch(bl())); + EXPECT_TRUE(Analysis->isIndirectBranch(jirl(LoongArch::R0))); + EXPECT_FALSE(Analysis->isIndirectBranch(jirl(LoongArch::R1))); +-- +2.20.1 + + +From 034d4087be71c54248fff1bf7eae66291671776a Mon Sep 17 00:00:00 2001 +From: ZhaoQi +Date: Thu, 16 Nov 2023 14:01:58 +0800 +Subject: [PATCH 06/14] [LoongArch] Set isBarrier to true for instruction 'b' + (#72339) + +Instr "b offs26" represent to an unconditional branch in LoongArch. Set +isBarrier to 1 in tablegen for it, so that MCInstrAnalysis can return +correctly. + +Fixes https://github.com/llvm/llvm-project/pull/71903. 
+ +(cherry picked from commit 42a4d5e8cab1537515d92ed56d6e17b673ed352f) +--- + llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 1 + + llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp | 8 ++------ + 2 files changed, 3 insertions(+), 6 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index 166379d7d592..05ae36a9781d 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -586,6 +586,7 @@ class Br_I26 op> + : FmtI26 { + let isBranch = 1; + let isTerminator = 1; ++ let isBarrier = 1; + } + } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 + +diff --git a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp +index 6e1919fc2261..468ee79615d6 100644 +--- a/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp ++++ b/llvm/unittests/Target/LoongArch/MCInstrAnalysisTest.cpp +@@ -94,17 +94,13 @@ TEST_P(InstrAnalysisTest, IsBranch) { + + TEST_P(InstrAnalysisTest, IsConditionalBranch) { + EXPECT_TRUE(Analysis->isConditionalBranch(beq())); +- // FIXME: Instr 'b' is not a ConditionalBranch, so the analysis here is +- // wrong. The following patch will fix it. +- EXPECT_TRUE(Analysis->isConditionalBranch(b())); ++ EXPECT_FALSE(Analysis->isConditionalBranch(b())); + EXPECT_FALSE(Analysis->isConditionalBranch(bl())); + } + + TEST_P(InstrAnalysisTest, IsUnconditionalBranch) { + EXPECT_FALSE(Analysis->isUnconditionalBranch(beq())); +- // FIXME: Instr 'b' is an UnconditionalBranch, so the analysis here is +- // wrong. The following patch will fix it. 
+- EXPECT_FALSE(Analysis->isUnconditionalBranch(b())); ++ EXPECT_TRUE(Analysis->isUnconditionalBranch(b())); + EXPECT_FALSE(Analysis->isUnconditionalBranch(bl())); + EXPECT_TRUE(Analysis->isUnconditionalBranch(jirl(LoongArch::R0))); + EXPECT_FALSE(Analysis->isUnconditionalBranch(jirl(LoongArch::R1))); +-- +2.20.1 + + +From 701109dc419b8d07cd5254268d848dee1278b9ad Mon Sep 17 00:00:00 2001 +From: ZhaoQi +Date: Tue, 21 Nov 2023 08:34:52 +0800 +Subject: [PATCH 07/14] [LoongArch][MC] Pre-commit tests for instr bl fixupkind + testing (#72826) + +This patch is used to test whether fixupkind for bl can be returned +correctly. When BL has target-flags(loongarch-call), there is no error. +But without this flag, an assertion error will appear. So the test is +just tagged as "Expectedly Failed" now until the following patch fix it. + +(cherry picked from commit 2ca028ce7c6de5f1350440012355a65383b8729a) +--- + .../CodeGen/LoongArch/test_bl_fixupkind.mir | 66 +++++++++++++++++++ + 1 file changed, 66 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir + +diff --git a/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir +new file mode 100644 +index 000000000000..2c1d41be7711 +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir +@@ -0,0 +1,66 @@ ++## Tagged as "Expectedly Failed" until the following patch fix it ++# XFAIL: * ++# RUN: llc --mtriple=loongarch64 --filetype=obj %s -o - | \ ++# RUN: llvm-objdump -d - | FileCheck %s ++ ++# REQUIRES: asserts ++ ++## Check that bl can get fixupkind correctly. ++## When BL has target-flags(loongarch-call), there is no error. But without ++## this flag, an assertion error will appear: ++## Assertion `FixupKind != LoongArch::fixup_loongarch_invalid && "Unhandled expression!"' failed. 
++ ++--- | ++ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" ++ target triple = "loongarch64" ++ ++ define dso_local void @test_bl_fixupkind_with_flag() { ++ ; CHECK-LABEL: test_bl_fixupkind_with_flag ++ ; CHECK: addi.d $sp, $sp, -16 ++ ; CHECK-NEXT: st.d $ra, $sp, 8 ++ ; CHECK-NEXT: bl 0 ++ ; CHECK-NEXT: ld.d $ra, $sp, 8 ++ ; CHECK-NEXT: addi.d $sp, $sp, 16 ++ ; CHECK-NEXT: ret ++ entry: ++ call void @foo() ++ ret void ++ } ++ ++ define dso_local void @test_bl_fixupkind_without_flag() { ++ ; CHECK-LABEL: test_bl_fixupkind_without_flag ++ ; CHECK: addi.d $sp, $sp, -16 ++ ; CHECK-NEXT: st.d $ra, $sp, 8 ++ ; CHECK-NEXT: bl 0 ++ ; CHECK-NEXT: ld.d $ra, $sp, 8 ++ ; CHECK-NEXT: addi.d $sp, $sp, 16 ++ ; CHECK-NEXT: ret ++ entry: ++ call void @foo() ++ ret void ++ } ++ ++ declare dso_local void @foo(...) ++... ++--- ++name: test_bl_fixupkind_with_flag ++tracksRegLiveness: true ++body: | ++ bb.0.entry: ++ ADJCALLSTACKDOWN 0, 0, implicit-def dead $r3, implicit $r3 ++ BL target-flags(loongarch-call) @foo, csr_ilp32d_lp64d, implicit-def $r1, implicit-def dead $r1, implicit-def $r3 ++ ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3 ++ PseudoRET ++ ++... ++--- ++name: test_bl_fixupkind_without_flag ++tracksRegLiveness: true ++body: | ++ bb.0.entry: ++ ADJCALLSTACKDOWN 0, 0, implicit-def dead $r3, implicit $r3 ++ BL @foo, csr_ilp32d_lp64d, implicit-def $r1, implicit-def dead $r1, implicit-def $r3 ++ ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3 ++ PseudoRET ++ ++... +-- +2.20.1 + + +From a5bf03107b8738b0fab521d7718bed863056134b Mon Sep 17 00:00:00 2001 +From: ZhaoQi +Date: Tue, 21 Nov 2023 19:00:29 +0800 +Subject: [PATCH 08/14] [LoongArch][MC] Support to get the FixupKind for BL + (#72938) + +Previously, bolt could not get FixupKind for BL correctly, because bolt +cannot get target-flags for BL. Here just add support in MCCodeEmitter. + +Fixes https://github.com/llvm/llvm-project/pull/72826. 
+ +(cherry picked from commit 775d2f3201cf7fb657aaf58d1b37c130bd9eb8f9) +--- + .../LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp | 1 + + llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir | 8 ++------ + 2 files changed, 3 insertions(+), 6 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +index 08c0820cb862..09d92ac9aa3a 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +@@ -263,6 +263,7 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, + FixupKind = LoongArch::fixup_loongarch_b21; + break; + case LoongArch::B: ++ case LoongArch::BL: + FixupKind = LoongArch::fixup_loongarch_b26; + break; + } +diff --git a/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir +index 2c1d41be7711..70cd5fb8d7eb 100644 +--- a/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir ++++ b/llvm/test/CodeGen/LoongArch/test_bl_fixupkind.mir +@@ -1,14 +1,10 @@ +-## Tagged as "Expectedly Failed" until the following patch fix it +-# XFAIL: * + # RUN: llc --mtriple=loongarch64 --filetype=obj %s -o - | \ + # RUN: llvm-objdump -d - | FileCheck %s + + # REQUIRES: asserts + +-## Check that bl can get fixupkind correctly. +-## When BL has target-flags(loongarch-call), there is no error. But without +-## this flag, an assertion error will appear: +-## Assertion `FixupKind != LoongArch::fixup_loongarch_invalid && "Unhandled expression!"' failed. ++## Check that bl can get fixupkind correctly, whether BL contains ++## target-flags(loongarch-call) or not. 
+ + --- | + target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" +-- +2.20.1 + + +From 20421e57af53d963a95c6c318f71f9399d241188 Mon Sep 17 00:00:00 2001 +From: ZhaoQi +Date: Thu, 23 Nov 2023 16:38:41 +0800 +Subject: [PATCH 09/14] [LoongArch][MC] Modify branch evaluation for + MCInstrAnalysis (#73205) + +Function evaluateBranch() is used to compute target address for a given +branch instruction and return true on success. But target address of +indirect branch cannot be simply added, so rule it out and just return +false. + +This patch also add objdump tests which capture the current state of +support for printing branch targets. Without this patch, the result of +"jirl $zero, $a0, 4" is "jirl $zero, $a0, 4 ". It is obviously +incorrect, because this instruction represents an indirect branch whose +target address depends on both the register value and the imm. After +this patch, it will be right despite loss of details. + +(cherry picked from commit 1c68c4c57a65a67963264878bc4646be8b58854c) +--- + .../MCTargetDesc/LoongArchMCTargetDesc.cpp | 3 +- + .../llvm-objdump/ELF/LoongArch/branches.s | 76 +++++++++++++++++++ + .../llvm-objdump/ELF/LoongArch/lit.local.cfg | 2 + + 3 files changed, 80 insertions(+), 1 deletion(-) + create mode 100644 llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s + create mode 100644 llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg + +diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +index d580c3457fec..a4e6a09863e6 100644 +--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp ++++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +@@ -97,7 +97,8 @@ public: + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + unsigned NumOps = Inst.getNumOperands(); +- if (isBranch(Inst) || Inst.getOpcode() == LoongArch::BL) { ++ if ((isBranch(Inst) && 
!isIndirectBranch(Inst)) || ++ Inst.getOpcode() == LoongArch::BL) { + Target = Addr + Inst.getOperand(NumOps - 1).getImm(); + return true; + } +diff --git a/llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s b/llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s +new file mode 100644 +index 000000000000..8cb00aef9954 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/LoongArch/branches.s +@@ -0,0 +1,76 @@ ++# RUN: llvm-mc --triple=loongarch32 --filetype=obj < %s | \ ++# RUN: llvm-objdump -d --no-show-raw-insn - | FileCheck %s ++# RUN: llvm-mc --triple=loongarch64 --filetype=obj < %s | \ ++# RUN: llvm-objdump -d --no-show-raw-insn - | FileCheck %s ++ ++# CHECK-LABEL: : ++foo: ++# CHECK: beq $a0, $a1, 108 ++beq $a0, $a1, .Llocal ++# CHECK: bne $a0, $a1, 104 ++bne $a0, $a1, .Llocal ++# CHECK: blt $a0, $a1, 100 ++blt $a0, $a1, .Llocal ++# CHECK: bltu $a0, $a1, 96 ++bltu $a0, $a1, .Llocal ++# CHECK: bge $a0, $a1, 92 ++bge $a0, $a1, .Llocal ++# CHECK: bgeu $a0, $a1, 88 ++bgeu $a0, $a1, .Llocal ++# CHECK: beqz $a0, 84 ++beqz $a0, .Llocal ++# CHECK: bnez $a0, 80 ++bnez $a0, .Llocal ++# CHECK: bceqz $fcc6, 76 ++bceqz $fcc6, .Llocal ++# CHECK: bcnez $fcc6, 72 ++bcnez $fcc6, .Llocal ++ ++# CHECK: beq $a0, $a1, 76 ++beq $a0, $a1, bar ++# CHECK: bne $a0, $a1, 72 ++bne $a0, $a1, bar ++# CHECK: blt $a0, $a1, 68 ++blt $a0, $a1, bar ++# CHECK: bltu $a0, $a1, 64 ++bltu $a0, $a1, bar ++# CHECK: bge $a0, $a1, 60 ++bge $a0, $a1, bar ++# CHECK: bgeu $a0, $a1, 56 ++bgeu $a0, $a1, bar ++# CHECK: beqz $a0, 52 ++beqz $a0, bar ++# CHECK: bnez $a0, 48 ++bnez $a0, bar ++# CHECK: bceqz $fcc6, 44 ++bceqz $fcc6, bar ++# CHECK: bcnez $fcc6, 40 ++bcnez $fcc6, bar ++ ++# CHECK: b 28 ++b .Llocal ++# CHECK: b 32 ++b bar ++ ++# CHECK: bl 20 ++bl .Llocal ++# CHECK: bl 24 ++bl bar ++ ++# CHECK: jirl $zero, $a0, 4{{$}} ++jirl $zero, $a0, 4 ++# CHECK: jirl $ra, $a0, 4{{$}} ++jirl $ra, $a0, 4 ++# CHECK: ret ++ret ++ ++.Llocal: ++# CHECK: 6c: nop ++# CHECK: nop ++nop ++nop ++ ++# CHECK-LABEL: : 
++bar: ++# CHECK: 74: nop ++nop +diff --git a/llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg b/llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg +new file mode 100644 +index 000000000000..cc24278acbb4 +--- /dev/null ++++ b/llvm/test/tools/llvm-objdump/ELF/LoongArch/lit.local.cfg +@@ -0,0 +1,2 @@ ++if not "LoongArch" in config.root.targets: ++ config.unsupported = True +-- +2.20.1 + + +From 0fe85205a8637c6671f423cddd41b712085232ac Mon Sep 17 00:00:00 2001 +From: hev +Date: Thu, 23 Nov 2023 15:15:26 +0800 +Subject: [PATCH 10/14] [LoongArch] Precommit a test for smul with overflow + (NFC) (#73212) + +(cherry picked from commit 7414c0db962f8a5029fd44c3e0bc93d9ce20be71) +--- + .../CodeGen/LoongArch/smul-with-overflow.ll | 118 ++++++++++++++++++ + 1 file changed, 118 insertions(+) + create mode 100644 llvm/test/CodeGen/LoongArch/smul-with-overflow.ll + +diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll +new file mode 100644 +index 000000000000..a53e77e5aa4b +--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll +@@ -0,0 +1,118 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32 ++; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64 ++ ++define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ++; LA32-LABEL: smuloi64: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: .cfi_def_cfa_offset 16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill ++; LA32-NEXT: .cfi_offset 1, -4 ++; LA32-NEXT: .cfi_offset 22, -8 ++; LA32-NEXT: move $fp, $a4 ++; LA32-NEXT: st.w $zero, $sp, 4 ++; LA32-NEXT: addi.w $a4, $sp, 4 ++; LA32-NEXT: bl %plt(__mulodi4) ++; LA32-NEXT: st.w $a1, $fp, 4 ++; LA32-NEXT: st.w $a0, $fp, 0 ++; LA32-NEXT: ld.w $a0, $sp, 4 ++; LA32-NEXT: sltu $a0, $zero, $a0 ++; 
LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: smuloi64: ++; LA64: # %bb.0: ++; LA64-NEXT: mul.d $a3, $a0, $a1 ++; LA64-NEXT: st.d $a3, $a2, 0 ++; LA64-NEXT: mulh.d $a0, $a0, $a1 ++; LA64-NEXT: srai.d $a1, $a3, 63 ++; LA64-NEXT: xor $a0, $a0, $a1 ++; LA64-NEXT: sltu $a0, $zero, $a0 ++; LA64-NEXT: ret ++ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) ++ %val = extractvalue {i64, i1} %t, 0 ++ %obit = extractvalue {i64, i1} %t, 1 ++ store i64 %val, ptr %res ++ ret i1 %obit ++} ++ ++define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ++; LA32-LABEL: smuloi128: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -64 ++; LA32-NEXT: .cfi_def_cfa_offset 64 ++; LA32-NEXT: st.w $ra, $sp, 60 # 4-byte Folded Spill ++; LA32-NEXT: st.w $fp, $sp, 56 # 4-byte Folded Spill ++; LA32-NEXT: .cfi_offset 1, -4 ++; LA32-NEXT: .cfi_offset 22, -8 ++; LA32-NEXT: move $fp, $a2 ++; LA32-NEXT: st.w $zero, $sp, 52 ++; LA32-NEXT: ld.w $a2, $a1, 12 ++; LA32-NEXT: st.w $a2, $sp, 12 ++; LA32-NEXT: ld.w $a2, $a1, 8 ++; LA32-NEXT: st.w $a2, $sp, 8 ++; LA32-NEXT: ld.w $a2, $a1, 4 ++; LA32-NEXT: st.w $a2, $sp, 4 ++; LA32-NEXT: ld.w $a1, $a1, 0 ++; LA32-NEXT: st.w $a1, $sp, 0 ++; LA32-NEXT: ld.w $a1, $a0, 12 ++; LA32-NEXT: st.w $a1, $sp, 28 ++; LA32-NEXT: ld.w $a1, $a0, 8 ++; LA32-NEXT: st.w $a1, $sp, 24 ++; LA32-NEXT: ld.w $a1, $a0, 4 ++; LA32-NEXT: st.w $a1, $sp, 20 ++; LA32-NEXT: ld.w $a0, $a0, 0 ++; LA32-NEXT: st.w $a0, $sp, 16 ++; LA32-NEXT: addi.w $a0, $sp, 32 ++; LA32-NEXT: addi.w $a1, $sp, 16 ++; LA32-NEXT: addi.w $a2, $sp, 0 ++; LA32-NEXT: addi.w $a3, $sp, 52 ++; LA32-NEXT: bl %plt(__muloti4) ++; LA32-NEXT: ld.w $a0, $sp, 44 ++; LA32-NEXT: st.w $a0, $fp, 12 ++; LA32-NEXT: ld.w $a0, $sp, 40 ++; LA32-NEXT: st.w $a0, $fp, 8 ++; LA32-NEXT: ld.w $a0, $sp, 36 ++; LA32-NEXT: st.w $a0, $fp, 4 ++; LA32-NEXT: ld.w $a0, $sp, 32 ++; LA32-NEXT: 
st.w $a0, $fp, 0 ++; LA32-NEXT: ld.w $a0, $sp, 52 ++; LA32-NEXT: sltu $a0, $zero, $a0 ++; LA32-NEXT: ld.w $fp, $sp, 56 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $ra, $sp, 60 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 64 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: smuloi128: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.d $sp, $sp, -32 ++; LA64-NEXT: .cfi_def_cfa_offset 32 ++; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill ++; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill ++; LA64-NEXT: .cfi_offset 1, -8 ++; LA64-NEXT: .cfi_offset 22, -16 ++; LA64-NEXT: move $fp, $a4 ++; LA64-NEXT: st.d $zero, $sp, 8 ++; LA64-NEXT: addi.d $a4, $sp, 8 ++; LA64-NEXT: bl %plt(__muloti4) ++; LA64-NEXT: st.d $a1, $fp, 8 ++; LA64-NEXT: st.d $a0, $fp, 0 ++; LA64-NEXT: ld.d $a0, $sp, 8 ++; LA64-NEXT: sltu $a0, $zero, $a0 ++; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ++; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload ++; LA64-NEXT: addi.d $sp, $sp, 32 ++; LA64-NEXT: ret ++ %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2) ++ %val = extractvalue {i128, i1} %t, 0 ++ %obit = extractvalue {i128, i1} %t, 1 ++ store i128 %val, ptr %res ++ ret i1 %obit ++} ++ ++declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone ++declare {i128, i1} @llvm.smul.with.overflow.i128(i128, i128) nounwind readnone +-- +2.20.1 + + +From e29ff285726046ec46c9005c67ba992e3efc8ace Mon Sep 17 00:00:00 2001 +From: hev +Date: Thu, 23 Nov 2023 19:34:50 +0800 +Subject: [PATCH 11/14] [LoongArch] Disable mulodi4 and muloti4 libcalls + (#73199) + +This library function only exists in compiler-rt not libgcc. So this +would fail to link unless we were linking with compiler-rt. 
+ +Fixes https://github.com/ClangBuiltLinux/linux/issues/1958 + +(cherry picked from commit 0d9f557b6c36da3aa92daff4c0d37ea821d7ae1e) +--- + .../LoongArch/LoongArchISelLowering.cpp | 5 + + .../CodeGen/LoongArch/smul-with-overflow.ll | 463 +++++++++++++++--- + 2 files changed, 397 insertions(+), 71 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index f7eacd56c542..ed106cb766bc 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -152,8 +152,13 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + + // Set libcalls. + setLibcallName(RTLIB::MUL_I128, nullptr); ++ // The MULO libcall is not part of libgcc, only compiler-rt. ++ setLibcallName(RTLIB::MULO_I64, nullptr); + } + ++ // The MULO libcall is not part of libgcc, only compiler-rt. ++ setLibcallName(RTLIB::MULO_I128, nullptr); ++ + static const ISD::CondCode FPCCToExpand[] = { + ISD::SETOGT, ISD::SETOGE, ISD::SETUGT, ISD::SETUGE, + ISD::SETGE, ISD::SETNE, ISD::SETGT}; +diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll +index a53e77e5aa4b..6cba4108d63c 100644 +--- a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll ++++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll +@@ -5,23 +5,53 @@ + define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { + ; LA32-LABEL: smuloi64: + ; LA32: # %bb.0: +-; LA32-NEXT: addi.w $sp, $sp, -16 +-; LA32-NEXT: .cfi_def_cfa_offset 16 +-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +-; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +-; LA32-NEXT: .cfi_offset 1, -4 +-; LA32-NEXT: .cfi_offset 22, -8 +-; LA32-NEXT: move $fp, $a4 +-; LA32-NEXT: st.w $zero, $sp, 4 +-; LA32-NEXT: addi.w $a4, $sp, 4 +-; LA32-NEXT: bl %plt(__mulodi4) +-; LA32-NEXT: st.w $a1, $fp, 4 +-; LA32-NEXT: st.w $a0, $fp, 0 +-; LA32-NEXT: ld.w $a0, $sp, 4 ++; 
LA32-NEXT: srai.w $a5, $a1, 31 ++; LA32-NEXT: mul.w $a6, $a2, $a5 ++; LA32-NEXT: mulh.wu $a7, $a2, $a5 ++; LA32-NEXT: add.w $a7, $a7, $a6 ++; LA32-NEXT: mul.w $a5, $a3, $a5 ++; LA32-NEXT: add.w $a5, $a7, $a5 ++; LA32-NEXT: srai.w $a7, $a3, 31 ++; LA32-NEXT: mul.w $t0, $a7, $a1 ++; LA32-NEXT: mulh.wu $t1, $a7, $a0 ++; LA32-NEXT: add.w $t0, $t1, $t0 ++; LA32-NEXT: mul.w $a7, $a7, $a0 ++; LA32-NEXT: add.w $t0, $t0, $a7 ++; LA32-NEXT: add.w $a5, $t0, $a5 ++; LA32-NEXT: mulh.wu $t0, $a0, $a2 ++; LA32-NEXT: mul.w $t1, $a1, $a2 ++; LA32-NEXT: add.w $t0, $t1, $t0 ++; LA32-NEXT: sltu $t1, $t0, $t1 ++; LA32-NEXT: mulh.wu $t2, $a1, $a2 ++; LA32-NEXT: add.w $t1, $t2, $t1 ++; LA32-NEXT: mul.w $t2, $a0, $a3 ++; LA32-NEXT: add.w $t0, $t2, $t0 ++; LA32-NEXT: sltu $t2, $t0, $t2 ++; LA32-NEXT: mulh.wu $t3, $a0, $a3 ++; LA32-NEXT: add.w $t2, $t3, $t2 ++; LA32-NEXT: add.w $a6, $a7, $a6 ++; LA32-NEXT: sltu $a7, $a6, $a7 ++; LA32-NEXT: add.w $a5, $a5, $a7 ++; LA32-NEXT: mul.w $a0, $a0, $a2 ++; LA32-NEXT: mul.w $a2, $a1, $a3 ++; LA32-NEXT: mulh.wu $a1, $a1, $a3 ++; LA32-NEXT: add.w $a3, $t1, $t2 ++; LA32-NEXT: sltu $a7, $a3, $t1 ++; LA32-NEXT: add.w $a1, $a1, $a7 ++; LA32-NEXT: st.w $a0, $a4, 0 ++; LA32-NEXT: add.w $a0, $a2, $a3 ++; LA32-NEXT: sltu $a2, $a0, $a2 ++; LA32-NEXT: add.w $a1, $a1, $a2 ++; LA32-NEXT: st.w $t0, $a4, 4 ++; LA32-NEXT: add.w $a1, $a1, $a5 ++; LA32-NEXT: add.w $a2, $a0, $a6 ++; LA32-NEXT: sltu $a0, $a2, $a0 ++; LA32-NEXT: add.w $a0, $a1, $a0 ++; LA32-NEXT: srai.w $a1, $t0, 31 ++; LA32-NEXT: xor $a0, $a0, $a1 ++; LA32-NEXT: xor $a1, $a2, $a1 ++; LA32-NEXT: or $a0, $a1, $a0 + ; LA32-NEXT: sltu $a0, $zero, $a0 +-; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +-; LA32-NEXT: addi.w $sp, $sp, 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: smuloi64: +@@ -43,69 +73,360 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { + define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { + ; LA32-LABEL: smuloi128: 
+ ; LA32: # %bb.0: +-; LA32-NEXT: addi.w $sp, $sp, -64 +-; LA32-NEXT: .cfi_def_cfa_offset 64 +-; LA32-NEXT: st.w $ra, $sp, 60 # 4-byte Folded Spill +-; LA32-NEXT: st.w $fp, $sp, 56 # 4-byte Folded Spill ++; LA32-NEXT: addi.w $sp, $sp, -96 ++; LA32-NEXT: .cfi_def_cfa_offset 96 ++; LA32-NEXT: st.w $ra, $sp, 92 # 4-byte Folded Spill ++; LA32-NEXT: st.w $fp, $sp, 88 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s0, $sp, 84 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s1, $sp, 80 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s2, $sp, 76 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s3, $sp, 72 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s4, $sp, 68 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s5, $sp, 64 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s6, $sp, 60 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s7, $sp, 56 # 4-byte Folded Spill ++; LA32-NEXT: st.w $s8, $sp, 52 # 4-byte Folded Spill + ; LA32-NEXT: .cfi_offset 1, -4 + ; LA32-NEXT: .cfi_offset 22, -8 +-; LA32-NEXT: move $fp, $a2 +-; LA32-NEXT: st.w $zero, $sp, 52 +-; LA32-NEXT: ld.w $a2, $a1, 12 +-; LA32-NEXT: st.w $a2, $sp, 12 +-; LA32-NEXT: ld.w $a2, $a1, 8 +-; LA32-NEXT: st.w $a2, $sp, 8 +-; LA32-NEXT: ld.w $a2, $a1, 4 +-; LA32-NEXT: st.w $a2, $sp, 4 +-; LA32-NEXT: ld.w $a1, $a1, 0 +-; LA32-NEXT: st.w $a1, $sp, 0 +-; LA32-NEXT: ld.w $a1, $a0, 12 +-; LA32-NEXT: st.w $a1, $sp, 28 +-; LA32-NEXT: ld.w $a1, $a0, 8 +-; LA32-NEXT: st.w $a1, $sp, 24 +-; LA32-NEXT: ld.w $a1, $a0, 4 +-; LA32-NEXT: st.w $a1, $sp, 20 +-; LA32-NEXT: ld.w $a0, $a0, 0 +-; LA32-NEXT: st.w $a0, $sp, 16 +-; LA32-NEXT: addi.w $a0, $sp, 32 +-; LA32-NEXT: addi.w $a1, $sp, 16 +-; LA32-NEXT: addi.w $a2, $sp, 0 +-; LA32-NEXT: addi.w $a3, $sp, 52 +-; LA32-NEXT: bl %plt(__muloti4) +-; LA32-NEXT: ld.w $a0, $sp, 44 +-; LA32-NEXT: st.w $a0, $fp, 12 +-; LA32-NEXT: ld.w $a0, $sp, 40 +-; LA32-NEXT: st.w $a0, $fp, 8 +-; LA32-NEXT: ld.w $a0, $sp, 36 +-; LA32-NEXT: st.w $a0, $fp, 4 +-; LA32-NEXT: ld.w $a0, $sp, 32 +-; LA32-NEXT: st.w $a0, $fp, 0 +-; LA32-NEXT: ld.w $a0, $sp, 52 
++; LA32-NEXT: .cfi_offset 23, -12 ++; LA32-NEXT: .cfi_offset 24, -16 ++; LA32-NEXT: .cfi_offset 25, -20 ++; LA32-NEXT: .cfi_offset 26, -24 ++; LA32-NEXT: .cfi_offset 27, -28 ++; LA32-NEXT: .cfi_offset 28, -32 ++; LA32-NEXT: .cfi_offset 29, -36 ++; LA32-NEXT: .cfi_offset 30, -40 ++; LA32-NEXT: .cfi_offset 31, -44 ++; LA32-NEXT: st.w $a2, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ld.w $a6, $a1, 0 ++; LA32-NEXT: ld.w $a7, $a0, 0 ++; LA32-NEXT: mulh.wu $a3, $a7, $a6 ++; LA32-NEXT: ld.w $a5, $a0, 4 ++; LA32-NEXT: mul.w $a4, $a5, $a6 ++; LA32-NEXT: add.w $a3, $a4, $a3 ++; LA32-NEXT: sltu $a4, $a3, $a4 ++; LA32-NEXT: mulh.wu $t0, $a5, $a6 ++; LA32-NEXT: add.w $a4, $t0, $a4 ++; LA32-NEXT: ld.w $t0, $a1, 4 ++; LA32-NEXT: mul.w $t1, $a7, $t0 ++; LA32-NEXT: add.w $a3, $t1, $a3 ++; LA32-NEXT: st.w $a3, $sp, 44 # 4-byte Folded Spill ++; LA32-NEXT: sltu $t1, $a3, $t1 ++; LA32-NEXT: mulh.wu $t2, $a7, $t0 ++; LA32-NEXT: add.w $t1, $t2, $t1 ++; LA32-NEXT: ld.w $t4, $a0, 12 ++; LA32-NEXT: ld.w $t2, $a0, 8 ++; LA32-NEXT: ld.w $t3, $a1, 8 ++; LA32-NEXT: mulh.wu $a0, $t2, $t3 ++; LA32-NEXT: mul.w $t5, $t4, $t3 ++; LA32-NEXT: add.w $a0, $t5, $a0 ++; LA32-NEXT: sltu $t5, $a0, $t5 ++; LA32-NEXT: mulh.wu $t6, $t4, $t3 ++; LA32-NEXT: add.w $t5, $t6, $t5 ++; LA32-NEXT: ld.w $t7, $a1, 12 ++; LA32-NEXT: mul.w $a1, $t2, $t7 ++; LA32-NEXT: add.w $a0, $a1, $a0 ++; LA32-NEXT: st.w $a0, $sp, 48 # 4-byte Folded Spill ++; LA32-NEXT: sltu $a1, $a0, $a1 ++; LA32-NEXT: mulh.wu $t6, $t2, $t7 ++; LA32-NEXT: add.w $t6, $t6, $a1 ++; LA32-NEXT: srai.w $s7, $t4, 31 ++; LA32-NEXT: mul.w $a1, $s7, $t7 ++; LA32-NEXT: mulh.wu $t8, $s7, $t3 ++; LA32-NEXT: add.w $t8, $t8, $a1 ++; LA32-NEXT: mulh.wu $fp, $a6, $s7 ++; LA32-NEXT: mul.w $s6, $t0, $s7 ++; LA32-NEXT: add.w $s8, $s6, $fp ++; LA32-NEXT: mul.w $a1, $a6, $s7 ++; LA32-NEXT: add.w $ra, $a1, $s8 ++; LA32-NEXT: sltu $s0, $ra, $a1 ++; LA32-NEXT: add.w $a0, $fp, $s0 ++; LA32-NEXT: add.w $a3, $a4, $t1 ++; LA32-NEXT: st.w $a3, $sp, 20 # 4-byte Folded Spill ++; 
LA32-NEXT: sltu $a4, $a3, $a4 ++; LA32-NEXT: mulh.wu $t1, $a5, $t0 ++; LA32-NEXT: add.w $a3, $t1, $a4 ++; LA32-NEXT: st.w $a3, $sp, 28 # 4-byte Folded Spill ++; LA32-NEXT: srai.w $s4, $t7, 31 ++; LA32-NEXT: mul.w $fp, $a7, $s4 ++; LA32-NEXT: mulh.wu $a4, $a7, $s4 ++; LA32-NEXT: add.w $s1, $a4, $fp ++; LA32-NEXT: sltu $s0, $s1, $fp ++; LA32-NEXT: add.w $s5, $a4, $s0 ++; LA32-NEXT: mul.w $a4, $s7, $t3 ++; LA32-NEXT: add.w $t8, $t8, $a4 ++; LA32-NEXT: add.w $s0, $ra, $t8 ++; LA32-NEXT: add.w $a3, $a1, $a4 ++; LA32-NEXT: st.w $a3, $sp, 32 # 4-byte Folded Spill ++; LA32-NEXT: sltu $a4, $a3, $a1 ++; LA32-NEXT: add.w $a3, $s0, $a4 ++; LA32-NEXT: st.w $a3, $sp, 24 # 4-byte Folded Spill ++; LA32-NEXT: add.w $s3, $t5, $t6 ++; LA32-NEXT: sltu $a4, $s3, $t5 ++; LA32-NEXT: mulh.wu $t5, $t4, $t7 ++; LA32-NEXT: add.w $a3, $t5, $a4 ++; LA32-NEXT: st.w $a3, $sp, 16 # 4-byte Folded Spill ++; LA32-NEXT: mul.w $a4, $a7, $a6 ++; LA32-NEXT: st.w $a4, $a2, 0 ++; LA32-NEXT: sltu $a4, $s8, $s6 ++; LA32-NEXT: mulh.wu $t5, $t0, $s7 ++; LA32-NEXT: add.w $a4, $t5, $a4 ++; LA32-NEXT: add.w $t1, $a4, $a0 ++; LA32-NEXT: sltu $a4, $t1, $a4 ++; LA32-NEXT: add.w $s2, $t5, $a4 ++; LA32-NEXT: mulh.wu $a4, $a7, $t3 ++; LA32-NEXT: mul.w $t5, $a5, $t3 ++; LA32-NEXT: add.w $a4, $t5, $a4 ++; LA32-NEXT: sltu $t5, $a4, $t5 ++; LA32-NEXT: mulh.wu $t6, $a5, $t3 ++; LA32-NEXT: add.w $a3, $t6, $t5 ++; LA32-NEXT: mul.w $t6, $a7, $t7 ++; LA32-NEXT: add.w $t5, $t6, $a4 ++; LA32-NEXT: sltu $a4, $t5, $t6 ++; LA32-NEXT: mulh.wu $t6, $a7, $t7 ++; LA32-NEXT: add.w $a4, $t6, $a4 ++; LA32-NEXT: mulh.wu $t6, $t2, $a6 ++; LA32-NEXT: mul.w $s7, $t4, $a6 ++; LA32-NEXT: add.w $t6, $s7, $t6 ++; LA32-NEXT: sltu $s7, $t6, $s7 ++; LA32-NEXT: mulh.wu $s8, $t4, $a6 ++; LA32-NEXT: add.w $a0, $s8, $s7 ++; LA32-NEXT: mul.w $s7, $t2, $t0 ++; LA32-NEXT: add.w $t6, $s7, $t6 ++; LA32-NEXT: sltu $s7, $t6, $s7 ++; LA32-NEXT: mulh.wu $s8, $t2, $t0 ++; LA32-NEXT: add.w $a2, $s8, $s7 ++; LA32-NEXT: mul.w $s8, $a5, $s4 ++; LA32-NEXT: add.w $s7, 
$s1, $s8 ++; LA32-NEXT: add.w $s1, $s7, $ra ++; LA32-NEXT: add.w $a1, $fp, $a1 ++; LA32-NEXT: st.w $a1, $sp, 40 # 4-byte Folded Spill ++; LA32-NEXT: sltu $ra, $a1, $fp ++; LA32-NEXT: add.w $a1, $s1, $ra ++; LA32-NEXT: st.w $a1, $sp, 36 # 4-byte Folded Spill ++; LA32-NEXT: xor $s0, $a1, $s7 ++; LA32-NEXT: sltui $s0, $s0, 1 ++; LA32-NEXT: sltu $a1, $a1, $s7 ++; LA32-NEXT: masknez $s1, $a1, $s0 ++; LA32-NEXT: maskeqz $s0, $ra, $s0 ++; LA32-NEXT: add.w $t1, $s6, $t1 ++; LA32-NEXT: sltu $s6, $t1, $s6 ++; LA32-NEXT: add.w $s2, $s2, $s6 ++; LA32-NEXT: add.w $a2, $a0, $a2 ++; LA32-NEXT: sltu $a0, $a2, $a0 ++; LA32-NEXT: mulh.wu $s6, $t4, $t0 ++; LA32-NEXT: add.w $t8, $s6, $a0 ++; LA32-NEXT: add.w $a4, $a3, $a4 ++; LA32-NEXT: sltu $a3, $a4, $a3 ++; LA32-NEXT: mulh.wu $s6, $a5, $t7 ++; LA32-NEXT: add.w $a3, $s6, $a3 ++; LA32-NEXT: mul.w $s6, $t4, $t7 ++; LA32-NEXT: mul.w $t7, $a5, $t7 ++; LA32-NEXT: mul.w $ra, $t4, $t0 ++; LA32-NEXT: mul.w $t0, $a5, $t0 ++; LA32-NEXT: mul.w $t4, $t4, $s4 ++; LA32-NEXT: mul.w $a7, $a7, $t3 ++; LA32-NEXT: mul.w $a6, $t2, $a6 ++; LA32-NEXT: mul.w $t3, $t2, $t3 ++; LA32-NEXT: mul.w $a0, $t2, $s4 ++; LA32-NEXT: mulh.wu $t2, $t2, $s4 ++; LA32-NEXT: mulh.wu $a5, $s4, $a5 ++; LA32-NEXT: sltu $s4, $s7, $s8 ++; LA32-NEXT: add.w $s4, $a5, $s4 ++; LA32-NEXT: add.w $s4, $s5, $s4 ++; LA32-NEXT: sltu $s5, $s4, $s5 ++; LA32-NEXT: add.w $s5, $a5, $s5 ++; LA32-NEXT: ld.w $a1, $sp, 20 # 4-byte Folded Reload ++; LA32-NEXT: add.w $a1, $t0, $a1 ++; LA32-NEXT: sltu $a5, $a1, $t0 ++; LA32-NEXT: ld.w $t0, $sp, 28 # 4-byte Folded Reload ++; LA32-NEXT: add.w $t0, $t0, $a5 ++; LA32-NEXT: or $s0, $s0, $s1 ++; LA32-NEXT: add.w $a4, $t7, $a4 ++; LA32-NEXT: sltu $a5, $a4, $t7 ++; LA32-NEXT: add.w $t7, $a3, $a5 ++; LA32-NEXT: add.w $s1, $ra, $a2 ++; LA32-NEXT: sltu $a2, $s1, $ra ++; LA32-NEXT: add.w $t8, $t8, $a2 ++; LA32-NEXT: add.w $a5, $s6, $s3 ++; LA32-NEXT: sltu $a2, $a5, $s6 ++; LA32-NEXT: ld.w $a3, $sp, 16 # 4-byte Folded Reload ++; LA32-NEXT: add.w $a2, $a3, $a2 ++; 
LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $a3, $sp, 44 # 4-byte Folded Reload ++; LA32-NEXT: st.w $a3, $s6, 4 ++; LA32-NEXT: ld.w $a3, $sp, 24 # 4-byte Folded Reload ++; LA32-NEXT: add.w $a3, $s2, $a3 ++; LA32-NEXT: ld.w $s2, $sp, 32 # 4-byte Folded Reload ++; LA32-NEXT: add.w $s2, $t1, $s2 ++; LA32-NEXT: sltu $t1, $s2, $t1 ++; LA32-NEXT: add.w $a3, $a3, $t1 ++; LA32-NEXT: add.w $t1, $s8, $s4 ++; LA32-NEXT: sltu $s3, $t1, $s8 ++; LA32-NEXT: add.w $s3, $s5, $s3 ++; LA32-NEXT: add.w $t2, $t2, $a0 ++; LA32-NEXT: add.w $t2, $t2, $t4 ++; LA32-NEXT: add.w $t2, $t2, $s7 ++; LA32-NEXT: add.w $t4, $a0, $fp ++; LA32-NEXT: sltu $a0, $t4, $a0 ++; LA32-NEXT: add.w $a0, $t2, $a0 ++; LA32-NEXT: add.w $a0, $s3, $a0 ++; LA32-NEXT: add.w $t2, $t1, $t4 ++; LA32-NEXT: sltu $t1, $t2, $t1 ++; LA32-NEXT: add.w $a0, $a0, $t1 ++; LA32-NEXT: add.w $a0, $a0, $a3 ++; LA32-NEXT: add.w $t1, $t2, $s2 ++; LA32-NEXT: sltu $a3, $t1, $t2 ++; LA32-NEXT: add.w $a0, $a0, $a3 ++; LA32-NEXT: add.w $a3, $t6, $t0 ++; LA32-NEXT: add.w $a1, $a6, $a1 ++; LA32-NEXT: sltu $a6, $a1, $a6 ++; LA32-NEXT: add.w $t0, $a3, $a6 ++; LA32-NEXT: add.w $a1, $a7, $a1 ++; LA32-NEXT: sltu $a7, $a1, $a7 ++; LA32-NEXT: add.w $a3, $t5, $t0 ++; LA32-NEXT: add.w $a3, $a3, $a7 ++; LA32-NEXT: sltu $t2, $a3, $t5 ++; LA32-NEXT: xor $t4, $a3, $t5 ++; LA32-NEXT: sltui $t4, $t4, 1 ++; LA32-NEXT: masknez $t2, $t2, $t4 ++; LA32-NEXT: maskeqz $a7, $a7, $t4 ++; LA32-NEXT: st.w $a1, $s6, 8 ++; LA32-NEXT: or $a1, $a7, $t2 ++; LA32-NEXT: sltu $a7, $t0, $t6 ++; LA32-NEXT: xor $t0, $t0, $t6 ++; LA32-NEXT: sltui $t0, $t0, 1 ++; LA32-NEXT: masknez $a7, $a7, $t0 ++; LA32-NEXT: maskeqz $a6, $a6, $t0 ++; LA32-NEXT: or $a6, $a6, $a7 ++; LA32-NEXT: add.w $a6, $s1, $a6 ++; LA32-NEXT: sltu $a7, $a6, $s1 ++; LA32-NEXT: add.w $a7, $t8, $a7 ++; LA32-NEXT: add.w $a1, $a4, $a1 ++; LA32-NEXT: sltu $a4, $a1, $a4 ++; LA32-NEXT: add.w $a4, $t7, $a4 ++; LA32-NEXT: add.w $t0, $t1, $s0 ++; LA32-NEXT: sltu $t1, $t0, $t1 ++; LA32-NEXT: 
add.w $a0, $a0, $t1 ++; LA32-NEXT: st.w $a3, $s6, 12 ++; LA32-NEXT: add.w $a1, $a6, $a1 ++; LA32-NEXT: sltu $a6, $a1, $a6 ++; LA32-NEXT: add.w $a4, $a7, $a4 ++; LA32-NEXT: add.w $a4, $a4, $a6 ++; LA32-NEXT: sltu $t1, $a4, $a7 ++; LA32-NEXT: xor $a7, $a4, $a7 ++; LA32-NEXT: sltui $a7, $a7, 1 ++; LA32-NEXT: masknez $t1, $t1, $a7 ++; LA32-NEXT: maskeqz $a6, $a6, $a7 ++; LA32-NEXT: or $a6, $a6, $t1 ++; LA32-NEXT: add.w $a6, $a5, $a6 ++; LA32-NEXT: sltu $a5, $a6, $a5 ++; LA32-NEXT: add.w $a2, $a2, $a5 ++; LA32-NEXT: ld.w $t1, $sp, 48 # 4-byte Folded Reload ++; LA32-NEXT: add.w $a4, $t1, $a4 ++; LA32-NEXT: add.w $a1, $t3, $a1 ++; LA32-NEXT: sltu $a5, $a1, $t3 ++; LA32-NEXT: add.w $a4, $a4, $a5 ++; LA32-NEXT: sltu $a7, $a4, $t1 ++; LA32-NEXT: xor $t1, $a4, $t1 ++; LA32-NEXT: sltui $t1, $t1, 1 ++; LA32-NEXT: masknez $a7, $a7, $t1 ++; LA32-NEXT: maskeqz $a5, $a5, $t1 ++; LA32-NEXT: or $a5, $a5, $a7 ++; LA32-NEXT: add.w $a5, $a6, $a5 ++; LA32-NEXT: sltu $a6, $a5, $a6 ++; LA32-NEXT: add.w $a2, $a2, $a6 ++; LA32-NEXT: add.w $a0, $a2, $a0 ++; LA32-NEXT: add.w $a2, $a5, $t0 ++; LA32-NEXT: sltu $a5, $a2, $a5 ++; LA32-NEXT: add.w $a0, $a0, $a5 ++; LA32-NEXT: ld.w $a5, $sp, 40 # 4-byte Folded Reload ++; LA32-NEXT: add.w $a5, $a1, $a5 ++; LA32-NEXT: sltu $a1, $a5, $a1 ++; LA32-NEXT: ld.w $a6, $sp, 36 # 4-byte Folded Reload ++; LA32-NEXT: add.w $a6, $a4, $a6 ++; LA32-NEXT: add.w $a6, $a6, $a1 ++; LA32-NEXT: sltu $a7, $a6, $a4 ++; LA32-NEXT: xor $a4, $a6, $a4 ++; LA32-NEXT: sltui $a4, $a4, 1 ++; LA32-NEXT: masknez $a7, $a7, $a4 ++; LA32-NEXT: maskeqz $a1, $a1, $a4 ++; LA32-NEXT: or $a1, $a1, $a7 ++; LA32-NEXT: add.w $a1, $a2, $a1 ++; LA32-NEXT: sltu $a2, $a1, $a2 ++; LA32-NEXT: add.w $a0, $a0, $a2 ++; LA32-NEXT: srai.w $a2, $a3, 31 ++; LA32-NEXT: xor $a3, $a6, $a2 ++; LA32-NEXT: xor $a0, $a0, $a2 ++; LA32-NEXT: or $a0, $a3, $a0 ++; LA32-NEXT: xor $a3, $a5, $a2 ++; LA32-NEXT: xor $a1, $a1, $a2 ++; LA32-NEXT: or $a1, $a3, $a1 ++; LA32-NEXT: or $a0, $a1, $a0 + ; LA32-NEXT: sltu $a0, 
$zero, $a0 +-; LA32-NEXT: ld.w $fp, $sp, 56 # 4-byte Folded Reload +-; LA32-NEXT: ld.w $ra, $sp, 60 # 4-byte Folded Reload +-; LA32-NEXT: addi.w $sp, $sp, 64 ++; LA32-NEXT: ld.w $s8, $sp, 52 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s7, $sp, 56 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s6, $sp, 60 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s5, $sp, 64 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s4, $sp, 68 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s3, $sp, 72 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s2, $sp, 76 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s1, $sp, 80 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $s0, $sp, 84 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $fp, $sp, 88 # 4-byte Folded Reload ++; LA32-NEXT: ld.w $ra, $sp, 92 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 96 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: smuloi128: + ; LA64: # %bb.0: +-; LA64-NEXT: addi.d $sp, $sp, -32 +-; LA64-NEXT: .cfi_def_cfa_offset 32 +-; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +-; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +-; LA64-NEXT: .cfi_offset 1, -8 +-; LA64-NEXT: .cfi_offset 22, -16 +-; LA64-NEXT: move $fp, $a4 +-; LA64-NEXT: st.d $zero, $sp, 8 +-; LA64-NEXT: addi.d $a4, $sp, 8 +-; LA64-NEXT: bl %plt(__muloti4) +-; LA64-NEXT: st.d $a1, $fp, 8 +-; LA64-NEXT: st.d $a0, $fp, 0 +-; LA64-NEXT: ld.d $a0, $sp, 8 ++; LA64-NEXT: srai.d $a5, $a1, 63 ++; LA64-NEXT: mul.d $a6, $a2, $a5 ++; LA64-NEXT: mulh.du $a7, $a2, $a5 ++; LA64-NEXT: add.d $a7, $a7, $a6 ++; LA64-NEXT: mul.d $a5, $a3, $a5 ++; LA64-NEXT: add.d $a5, $a7, $a5 ++; LA64-NEXT: srai.d $a7, $a3, 63 ++; LA64-NEXT: mul.d $t0, $a7, $a1 ++; LA64-NEXT: mulh.du $t1, $a7, $a0 ++; LA64-NEXT: add.d $t0, $t1, $t0 ++; LA64-NEXT: mul.d $a7, $a7, $a0 ++; LA64-NEXT: add.d $t0, $t0, $a7 ++; LA64-NEXT: add.d $a5, $t0, $a5 ++; LA64-NEXT: mulh.du $t0, $a0, $a2 ++; LA64-NEXT: mul.d $t1, $a1, $a2 ++; LA64-NEXT: add.d $t0, $t1, $t0 ++; LA64-NEXT: sltu $t1, $t0, $t1 ++; LA64-NEXT: mulh.du $t2, $a1, $a2 ++; 
LA64-NEXT: add.d $t1, $t2, $t1 ++; LA64-NEXT: mul.d $t2, $a0, $a3 ++; LA64-NEXT: add.d $t0, $t2, $t0 ++; LA64-NEXT: sltu $t2, $t0, $t2 ++; LA64-NEXT: mulh.du $t3, $a0, $a3 ++; LA64-NEXT: add.d $t2, $t3, $t2 ++; LA64-NEXT: add.d $a6, $a7, $a6 ++; LA64-NEXT: sltu $a7, $a6, $a7 ++; LA64-NEXT: add.d $a5, $a5, $a7 ++; LA64-NEXT: mul.d $a0, $a0, $a2 ++; LA64-NEXT: mul.d $a2, $a1, $a3 ++; LA64-NEXT: mulh.du $a1, $a1, $a3 ++; LA64-NEXT: add.d $a3, $t1, $t2 ++; LA64-NEXT: sltu $a7, $a3, $t1 ++; LA64-NEXT: add.d $a1, $a1, $a7 ++; LA64-NEXT: st.d $a0, $a4, 0 ++; LA64-NEXT: add.d $a0, $a2, $a3 ++; LA64-NEXT: sltu $a2, $a0, $a2 ++; LA64-NEXT: add.d $a1, $a1, $a2 ++; LA64-NEXT: st.d $t0, $a4, 8 ++; LA64-NEXT: add.d $a1, $a1, $a5 ++; LA64-NEXT: add.d $a2, $a0, $a6 ++; LA64-NEXT: sltu $a0, $a2, $a0 ++; LA64-NEXT: add.d $a0, $a1, $a0 ++; LA64-NEXT: srai.d $a1, $t0, 63 ++; LA64-NEXT: xor $a0, $a0, $a1 ++; LA64-NEXT: xor $a1, $a2, $a1 ++; LA64-NEXT: or $a0, $a1, $a0 + ; LA64-NEXT: sltu $a0, $zero, $a0 +-; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +-; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +-; LA64-NEXT: addi.d $sp, $sp, 32 + ; LA64-NEXT: ret + %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2) + %val = extractvalue {i128, i1} %t, 0 +-- +2.20.1 + + +From 01ced6193e2abfbd50fbd9d40066cf27f9f9067b Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Wed, 29 Nov 2023 15:21:21 +0800 +Subject: [PATCH 12/14] [LoongArch] Fix pattern for FNMSUB_{S/D} instructions + (#73742) + +``` +when a=c=-0.0, b=0.0: +-(a * b + (-c)) = -0.0 +-a * b + c = 0.0 +(fneg (fma a, b (-c))) != (fma (fneg a), b ,c) +``` + +See https://reviews.llvm.org/D90901 for a similar discussion on X86. 
+ +(cherry picked from commit 5e7e0d603204ede803323a825318e365a87f73e9) +--- + .../LoongArch/LoongArchFloat32InstrInfo.td | 8 +- + .../LoongArch/LoongArchFloat64InstrInfo.td | 6 +- + llvm/test/CodeGen/LoongArch/double-fma.ll | 259 ++++++++++++++++-- + llvm/test/CodeGen/LoongArch/float-fma.ll | 259 ++++++++++++++++-- + 4 files changed, 483 insertions(+), 49 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +index 826db54febd3..65120c083f49 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +@@ -294,8 +294,12 @@ def : Pat<(fneg (fma FPR32:$fj, FPR32:$fk, FPR32:$fa)), + def : Pat<(fma_nsz (fneg FPR32:$fj), FPR32:$fk, (fneg FPR32:$fa)), + (FNMADD_S FPR32:$fj, FPR32:$fk, FPR32:$fa)>; + +-// fnmsub.s: -fj * fk + fa +-def : Pat<(fma (fneg FPR32:$fj), FPR32:$fk, FPR32:$fa), ++// fnmsub.s: -(fj * fk - fa) ++def : Pat<(fneg (fma FPR32:$fj, FPR32:$fk, (fneg FPR32:$fa))), ++ (FNMSUB_S FPR32:$fj, FPR32:$fk, FPR32:$fa)>; ++ ++// fnmsub.s: -fj * fk + fa (the nsz flag on the FMA) ++def : Pat<(fma_nsz (fneg FPR32:$fj), FPR32:$fk, FPR32:$fa), + (FNMSUB_S FPR32:$fj, FPR32:$fk, FPR32:$fa)>; + } // Predicates = [HasBasicF] + +diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +index 5118474725b6..437c1e4d7be2 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +@@ -256,7 +256,11 @@ def : Pat<(fma_nsz (fneg FPR64:$fj), FPR64:$fk, (fneg FPR64:$fa)), + (FNMADD_D FPR64:$fj, FPR64:$fk, FPR64:$fa)>; + + // fnmsub.d: -(fj * fk - fa) +-def : Pat<(fma (fneg FPR64:$fj), FPR64:$fk, FPR64:$fa), ++def : Pat<(fneg (fma FPR64:$fj, FPR64:$fk, (fneg FPR64:$fa))), ++ (FNMSUB_D FPR64:$fj, FPR64:$fk, FPR64:$fa)>; ++ ++// fnmsub.d: -fj * fk + fa (the nsz flag on the FMA) ++def : 
Pat<(fma_nsz (fneg FPR64:$fj), FPR64:$fk, FPR64:$fa), + (FNMSUB_D FPR64:$fj, FPR64:$fk, FPR64:$fa)>; + } // Predicates = [HasBasicD] + +diff --git a/llvm/test/CodeGen/LoongArch/double-fma.ll b/llvm/test/CodeGen/LoongArch/double-fma.ll +index 6dd628479433..58d20c62a668 100644 +--- a/llvm/test/CodeGen/LoongArch/double-fma.ll ++++ b/llvm/test/CodeGen/LoongArch/double-fma.ll +@@ -236,13 +236,15 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { + ; LA32-CONTRACT-ON-LABEL: fnmsub_d: + ; LA32-CONTRACT-ON: # %bb.0: + ; LA32-CONTRACT-ON-NEXT: fmul.d $fa0, $fa0, $fa1 +-; LA32-CONTRACT-ON-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA32-CONTRACT-ON-NEXT: fsub.d $fa0, $fa0, $fa2 ++; LA32-CONTRACT-ON-NEXT: fneg.d $fa0, $fa0 + ; LA32-CONTRACT-ON-NEXT: ret + ; + ; LA32-CONTRACT-OFF-LABEL: fnmsub_d: + ; LA32-CONTRACT-OFF: # %bb.0: + ; LA32-CONTRACT-OFF-NEXT: fmul.d $fa0, $fa0, $fa1 +-; LA32-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA32-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa0, $fa2 ++; LA32-CONTRACT-OFF-NEXT: fneg.d $fa0, $fa0 + ; LA32-CONTRACT-OFF-NEXT: ret + ; + ; LA64-CONTRACT-FAST-LABEL: fnmsub_d: +@@ -253,12 +255,98 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { + ; LA64-CONTRACT-ON-LABEL: fnmsub_d: + ; LA64-CONTRACT-ON: # %bb.0: + ; LA64-CONTRACT-ON-NEXT: fmul.d $fa0, $fa0, $fa1 +-; LA64-CONTRACT-ON-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA64-CONTRACT-ON-NEXT: fsub.d $fa0, $fa0, $fa2 ++; LA64-CONTRACT-ON-NEXT: fneg.d $fa0, $fa0 + ; LA64-CONTRACT-ON-NEXT: ret + ; + ; LA64-CONTRACT-OFF-LABEL: fnmsub_d: + ; LA64-CONTRACT-OFF: # %bb.0: + ; LA64-CONTRACT-OFF-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA64-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa0, $fa2 ++; LA64-CONTRACT-OFF-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %negc = fneg double %c ++ %mul = fmul double %a, %b ++ %add = fadd double %mul, %negc ++ %neg = fneg double %add ++ ret double %neg ++} ++ ++define double @fnmsub_d_nsz(double %a, double %b, double %c) nounwind { ++; 
LA32-CONTRACT-FAST-LABEL: fnmsub_d_nsz: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: fnmsub_d_nsz: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA32-CONTRACT-ON-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: fnmsub_d_nsz: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA32-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: fnmsub_d_nsz: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: fnmsub_d_nsz: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA64-CONTRACT-ON-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: fnmsub_d_nsz: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA64-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %nega = fneg nsz double %a ++ %mul = fmul nsz double %nega, %b ++ %add = fadd nsz double %mul, %c ++ ret double %add ++} ++ ++;; Check that fnmsub.d is not emitted. 
++define double @not_fnmsub_d(double %a, double %b, double %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: not_fnmsub_d: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-FAST-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: not_fnmsub_d: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA32-CONTRACT-ON-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: not_fnmsub_d: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA32-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: not_fnmsub_d: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-FAST-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: not_fnmsub_d: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fmul.d $fa0, $fa0, $fa1 ++; LA64-CONTRACT-ON-NEXT: fsub.d $fa0, $fa2, $fa0 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: not_fnmsub_d: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fmul.d $fa0, $fa0, $fa1 + ; LA64-CONTRACT-OFF-NEXT: fsub.d $fa0, $fa2, $fa0 + ; LA64-CONTRACT-OFF-NEXT: ret + %nega = fneg double %a +@@ -483,6 +571,86 @@ define double @contract_fnmsub_d(double %a, double %b, double %c) nounwind { + ; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_d: + ; LA64-CONTRACT-OFF: # %bb.0: + ; LA64-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %negc = fneg contract double %c ++ %mul = fmul contract double %a, %b ++ %add = fadd contract double %mul, %negc ++ %neg = fneg contract double %add ++ ret double %neg ++} ++ ++define double @contract_fnmsub_d_nsz(double %a, double %b, double %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: contract_fnmsub_d_nsz: ++; LA32-CONTRACT-FAST: # 
%bb.0: ++; LA32-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: contract_fnmsub_d_nsz: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: contract_fnmsub_d_nsz: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: contract_fnmsub_d_nsz: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: contract_fnmsub_d_nsz: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_d_nsz: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %nega = fneg contract nsz double %a ++ %mul = fmul contract nsz double %nega, %b ++ %add = fadd contract nsz double %mul, %c ++ ret double %add ++} ++ ++;; Check that fnmsub.d is not emitted. 
++define double @not_contract_fnmsub_d(double %a, double %b, double %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: not_contract_fnmsub_d: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-FAST-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: not_contract_fnmsub_d: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-ON-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: not_contract_fnmsub_d: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-OFF-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: not_contract_fnmsub_d: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-FAST-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: not_contract_fnmsub_d: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-ON-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: not_contract_fnmsub_d: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-OFF-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %nega = fneg contract double %a + %mul = fmul contract double %nega, %b +@@ -592,8 +760,8 @@ define double @fnmadd_d_intrinsics(double %a, double %b, double %c) nounwind { + ; LA64-CONTRACT-OFF-NEXT: fnmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %fma = call double @llvm.fma.f64(double %a, double %b, double %c) +- %neg = fneg double %fma +- ret double %neg ++ %negfma = fneg double %fma ++ ret double %negfma + } + + define double @fnmadd_d_nsz_intrinsics(double %a, double %b, double %c) nounwind { +@@ -704,44 +872,87 @@ define double 
@fnmsub_d_intrinsics(double %a, double %b, double %c) nounwind { + ; LA64-CONTRACT-OFF-LABEL: fnmsub_d_intrinsics: + ; LA64-CONTRACT-OFF: # %bb.0: + ; LA64-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %negc = fneg double %c ++ %fma = call double @llvm.fma.f64(double %a, double %b, double %negc) ++ %negfma = fneg double %fma ++ ret double %negfma ++} ++ ++define double @fnmsub_d_nsz_intrinsics(double %a, double %b, double %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: fnmsub_d_nsz_intrinsics: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: fnmsub_d_nsz_intrinsics: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: fnmsub_d_nsz_intrinsics: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: fnmsub_d_nsz_intrinsics: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: fnmsub_d_nsz_intrinsics: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: fnmsub_d_nsz_intrinsics: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %nega = fneg double %a +- %fma = call double @llvm.fma.f64(double %nega, double %b, double %c) ++ %fma = call nsz double @llvm.fma.f64(double %nega, double %b, double %c) + ret double %fma + } + +-define double @fnmsub_d_swap_intrinsics(double %a, double %b, double %c) nounwind { +-; LA32-CONTRACT-FAST-LABEL: fnmsub_d_swap_intrinsics: ++;; Check that fnmsub.d is not emitted. 
++define double @not_fnmsub_d_intrinsics(double %a, double %b, double %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: not_fnmsub_d_intrinsics: + ; LA32-CONTRACT-FAST: # %bb.0: +-; LA32-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa1, $fa0, $fa2 ++; LA32-CONTRACT-FAST-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-FAST-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA32-CONTRACT-FAST-NEXT: ret + ; +-; LA32-CONTRACT-ON-LABEL: fnmsub_d_swap_intrinsics: ++; LA32-CONTRACT-ON-LABEL: not_fnmsub_d_intrinsics: + ; LA32-CONTRACT-ON: # %bb.0: +-; LA32-CONTRACT-ON-NEXT: fnmsub.d $fa0, $fa1, $fa0, $fa2 ++; LA32-CONTRACT-ON-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-ON-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA32-CONTRACT-ON-NEXT: ret + ; +-; LA32-CONTRACT-OFF-LABEL: fnmsub_d_swap_intrinsics: ++; LA32-CONTRACT-OFF-LABEL: not_fnmsub_d_intrinsics: + ; LA32-CONTRACT-OFF: # %bb.0: +-; LA32-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa1, $fa0, $fa2 ++; LA32-CONTRACT-OFF-NEXT: fneg.d $fa0, $fa0 ++; LA32-CONTRACT-OFF-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA32-CONTRACT-OFF-NEXT: ret + ; +-; LA64-CONTRACT-FAST-LABEL: fnmsub_d_swap_intrinsics: ++; LA64-CONTRACT-FAST-LABEL: not_fnmsub_d_intrinsics: + ; LA64-CONTRACT-FAST: # %bb.0: +-; LA64-CONTRACT-FAST-NEXT: fnmsub.d $fa0, $fa1, $fa0, $fa2 ++; LA64-CONTRACT-FAST-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-FAST-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-FAST-NEXT: ret + ; +-; LA64-CONTRACT-ON-LABEL: fnmsub_d_swap_intrinsics: ++; LA64-CONTRACT-ON-LABEL: not_fnmsub_d_intrinsics: + ; LA64-CONTRACT-ON: # %bb.0: +-; LA64-CONTRACT-ON-NEXT: fnmsub.d $fa0, $fa1, $fa0, $fa2 ++; LA64-CONTRACT-ON-NEXT: fneg.d $fa0, $fa0 ++; LA64-CONTRACT-ON-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-ON-NEXT: ret + ; +-; LA64-CONTRACT-OFF-LABEL: fnmsub_d_swap_intrinsics: ++; LA64-CONTRACT-OFF-LABEL: not_fnmsub_d_intrinsics: + ; LA64-CONTRACT-OFF: # %bb.0: +-; LA64-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa1, $fa0, $fa2 ++; LA64-CONTRACT-OFF-NEXT: fneg.d $fa0, $fa0 ++; 
LA64-CONTRACT-OFF-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret +- %negb = fneg double %b +- %fma = call double @llvm.fma.f64(double %a, double %negb, double %c) ++ %nega = fneg double %a ++ %fma = call double @llvm.fma.f64(double %nega, double %b, double %c) + ret double %fma + } + +@@ -882,6 +1093,8 @@ define double @fnmsub_d_contract(double %a, double %b, double %c) nounwind { + ; LA64-CONTRACT-OFF-NEXT: fnmsub.d $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %mul = fmul contract double %a, %b +- %sub = fsub contract double %c, %mul +- ret double %sub ++ %negc = fneg contract double %c ++ %add = fadd contract double %negc, %mul ++ %negadd = fneg contract double %add ++ ret double %negadd + } +diff --git a/llvm/test/CodeGen/LoongArch/float-fma.ll b/llvm/test/CodeGen/LoongArch/float-fma.ll +index 54dc56784006..c236255d971a 100644 +--- a/llvm/test/CodeGen/LoongArch/float-fma.ll ++++ b/llvm/test/CodeGen/LoongArch/float-fma.ll +@@ -236,13 +236,15 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind { + ; LA32-CONTRACT-ON-LABEL: fnmsub_s: + ; LA32-CONTRACT-ON: # %bb.0: + ; LA32-CONTRACT-ON-NEXT: fmul.s $fa0, $fa0, $fa1 +-; LA32-CONTRACT-ON-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA32-CONTRACT-ON-NEXT: fsub.s $fa0, $fa0, $fa2 ++; LA32-CONTRACT-ON-NEXT: fneg.s $fa0, $fa0 + ; LA32-CONTRACT-ON-NEXT: ret + ; + ; LA32-CONTRACT-OFF-LABEL: fnmsub_s: + ; LA32-CONTRACT-OFF: # %bb.0: + ; LA32-CONTRACT-OFF-NEXT: fmul.s $fa0, $fa0, $fa1 +-; LA32-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA32-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa0, $fa2 ++; LA32-CONTRACT-OFF-NEXT: fneg.s $fa0, $fa0 + ; LA32-CONTRACT-OFF-NEXT: ret + ; + ; LA64-CONTRACT-FAST-LABEL: fnmsub_s: +@@ -253,12 +255,98 @@ define float @fnmsub_s(float %a, float %b, float %c) nounwind { + ; LA64-CONTRACT-ON-LABEL: fnmsub_s: + ; LA64-CONTRACT-ON: # %bb.0: + ; LA64-CONTRACT-ON-NEXT: fmul.s $fa0, $fa0, $fa1 +-; LA64-CONTRACT-ON-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA64-CONTRACT-ON-NEXT: 
fsub.s $fa0, $fa0, $fa2 ++; LA64-CONTRACT-ON-NEXT: fneg.s $fa0, $fa0 + ; LA64-CONTRACT-ON-NEXT: ret + ; + ; LA64-CONTRACT-OFF-LABEL: fnmsub_s: + ; LA64-CONTRACT-OFF: # %bb.0: + ; LA64-CONTRACT-OFF-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA64-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa0, $fa2 ++; LA64-CONTRACT-OFF-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %negc = fneg float %c ++ %mul = fmul float %a, %b ++ %add = fadd float %mul, %negc ++ %neg = fneg float %add ++ ret float %neg ++} ++ ++define float @fnmsub_s_nsz(float %a, float %b, float %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: fnmsub_s_nsz: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: fnmsub_s_nsz: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA32-CONTRACT-ON-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: fnmsub_s_nsz: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA32-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: fnmsub_s_nsz: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: fnmsub_s_nsz: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA64-CONTRACT-ON-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: fnmsub_s_nsz: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA64-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %nega = fneg nsz float %a ++ %mul = fmul nsz float %nega, %b ++ %add = fadd nsz float %mul, %c ++ ret float %add ++} ++ ++;; Check that fnmsub.s is not emitted. 
++define float @not_fnmsub_s(float %a, float %b, float %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: not_fnmsub_s: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-FAST-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: not_fnmsub_s: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA32-CONTRACT-ON-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: not_fnmsub_s: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA32-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: not_fnmsub_s: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-FAST-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: not_fnmsub_s: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fmul.s $fa0, $fa0, $fa1 ++; LA64-CONTRACT-ON-NEXT: fsub.s $fa0, $fa2, $fa0 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: not_fnmsub_s: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fmul.s $fa0, $fa0, $fa1 + ; LA64-CONTRACT-OFF-NEXT: fsub.s $fa0, $fa2, $fa0 + ; LA64-CONTRACT-OFF-NEXT: ret + %nega = fneg float %a +@@ -483,6 +571,86 @@ define float @contract_fnmsub_s(float %a, float %b, float %c) nounwind { + ; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_s: + ; LA64-CONTRACT-OFF: # %bb.0: + ; LA64-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %negc = fneg contract float %c ++ %mul = fmul contract float %a, %b ++ %add = fadd contract float %mul, %negc ++ %neg = fneg contract float %add ++ ret float %neg ++} ++ ++define float @contract_fnmsub_s_nsz(float %a, float %b, float %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: contract_fnmsub_s_nsz: ++; LA32-CONTRACT-FAST: # %bb.0: ++; 
LA32-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: contract_fnmsub_s_nsz: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: contract_fnmsub_s_nsz: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: contract_fnmsub_s_nsz: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: contract_fnmsub_s_nsz: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: contract_fnmsub_s_nsz: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %nega = fneg contract nsz float %a ++ %mul = fmul contract nsz float %nega, %b ++ %add = fadd contract nsz float %mul, %c ++ ret float %add ++} ++ ++;; Check that fnmsub.s is not emitted. 
++define float @not_contract_fnmsub_s(float %a, float %b, float %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: not_contract_fnmsub_s: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-FAST-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: not_contract_fnmsub_s: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-ON-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: not_contract_fnmsub_s: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-OFF-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: not_contract_fnmsub_s: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-FAST-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: not_contract_fnmsub_s: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-ON-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: not_contract_fnmsub_s: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-OFF-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %nega = fneg contract float %a + %mul = fmul contract float %nega, %b +@@ -592,8 +760,8 @@ define float @fnmadd_s_intrinsics(float %a, float %b, float %c) nounwind { + ; LA64-CONTRACT-OFF-NEXT: fnmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %fma = call float @llvm.fma.f64(float %a, float %b, float %c) +- %neg = fneg float %fma +- ret float %neg ++ %negfma = fneg float %fma ++ ret float %negfma + } + + define float @fnmadd_s_nsz_intrinsics(float %a, float %b, float %c) nounwind { +@@ -704,44 +872,87 @@ define float @fnmsub_s_intrinsics(float %a, 
float %b, float %c) nounwind { + ; LA64-CONTRACT-OFF-LABEL: fnmsub_s_intrinsics: + ; LA64-CONTRACT-OFF: # %bb.0: + ; LA64-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-OFF-NEXT: ret ++ %negc = fneg float %c ++ %fma = call float @llvm.fma.f64(float %a, float %b, float %negc) ++ %negfma = fneg float %fma ++ ret float %negfma ++} ++ ++define float @fnmsub_s_nsz_intrinsics(float %a, float %b, float %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: fnmsub_s_nsz_intrinsics: ++; LA32-CONTRACT-FAST: # %bb.0: ++; LA32-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-FAST-NEXT: ret ++; ++; LA32-CONTRACT-ON-LABEL: fnmsub_s_nsz_intrinsics: ++; LA32-CONTRACT-ON: # %bb.0: ++; LA32-CONTRACT-ON-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-ON-NEXT: ret ++; ++; LA32-CONTRACT-OFF-LABEL: fnmsub_s_nsz_intrinsics: ++; LA32-CONTRACT-OFF: # %bb.0: ++; LA32-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA32-CONTRACT-OFF-NEXT: ret ++; ++; LA64-CONTRACT-FAST-LABEL: fnmsub_s_nsz_intrinsics: ++; LA64-CONTRACT-FAST: # %bb.0: ++; LA64-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-FAST-NEXT: ret ++; ++; LA64-CONTRACT-ON-LABEL: fnmsub_s_nsz_intrinsics: ++; LA64-CONTRACT-ON: # %bb.0: ++; LA64-CONTRACT-ON-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 ++; LA64-CONTRACT-ON-NEXT: ret ++; ++; LA64-CONTRACT-OFF-LABEL: fnmsub_s_nsz_intrinsics: ++; LA64-CONTRACT-OFF: # %bb.0: ++; LA64-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %nega = fneg float %a +- %fma = call float @llvm.fma.f64(float %nega, float %b, float %c) ++ %fma = call nsz float @llvm.fma.f64(float %nega, float %b, float %c) + ret float %fma + } + +-define float @fnmsub_s_swap_intrinsics(float %a, float %b, float %c) nounwind { +-; LA32-CONTRACT-FAST-LABEL: fnmsub_s_swap_intrinsics: ++;; Check that fnmsub.s is not emitted. 
++define float @not_fnmsub_s_intrinsics(float %a, float %b, float %c) nounwind { ++; LA32-CONTRACT-FAST-LABEL: not_fnmsub_s_intrinsics: + ; LA32-CONTRACT-FAST: # %bb.0: +-; LA32-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa1, $fa0, $fa2 ++; LA32-CONTRACT-FAST-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-FAST-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA32-CONTRACT-FAST-NEXT: ret + ; +-; LA32-CONTRACT-ON-LABEL: fnmsub_s_swap_intrinsics: ++; LA32-CONTRACT-ON-LABEL: not_fnmsub_s_intrinsics: + ; LA32-CONTRACT-ON: # %bb.0: +-; LA32-CONTRACT-ON-NEXT: fnmsub.s $fa0, $fa1, $fa0, $fa2 ++; LA32-CONTRACT-ON-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-ON-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA32-CONTRACT-ON-NEXT: ret + ; +-; LA32-CONTRACT-OFF-LABEL: fnmsub_s_swap_intrinsics: ++; LA32-CONTRACT-OFF-LABEL: not_fnmsub_s_intrinsics: + ; LA32-CONTRACT-OFF: # %bb.0: +-; LA32-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa1, $fa0, $fa2 ++; LA32-CONTRACT-OFF-NEXT: fneg.s $fa0, $fa0 ++; LA32-CONTRACT-OFF-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA32-CONTRACT-OFF-NEXT: ret + ; +-; LA64-CONTRACT-FAST-LABEL: fnmsub_s_swap_intrinsics: ++; LA64-CONTRACT-FAST-LABEL: not_fnmsub_s_intrinsics: + ; LA64-CONTRACT-FAST: # %bb.0: +-; LA64-CONTRACT-FAST-NEXT: fnmsub.s $fa0, $fa1, $fa0, $fa2 ++; LA64-CONTRACT-FAST-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-FAST-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-FAST-NEXT: ret + ; +-; LA64-CONTRACT-ON-LABEL: fnmsub_s_swap_intrinsics: ++; LA64-CONTRACT-ON-LABEL: not_fnmsub_s_intrinsics: + ; LA64-CONTRACT-ON: # %bb.0: +-; LA64-CONTRACT-ON-NEXT: fnmsub.s $fa0, $fa1, $fa0, $fa2 ++; LA64-CONTRACT-ON-NEXT: fneg.s $fa0, $fa0 ++; LA64-CONTRACT-ON-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-ON-NEXT: ret + ; +-; LA64-CONTRACT-OFF-LABEL: fnmsub_s_swap_intrinsics: ++; LA64-CONTRACT-OFF-LABEL: not_fnmsub_s_intrinsics: + ; LA64-CONTRACT-OFF: # %bb.0: +-; LA64-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa1, $fa0, $fa2 ++; LA64-CONTRACT-OFF-NEXT: fneg.s $fa0, $fa0 ++; 
LA64-CONTRACT-OFF-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret +- %negb = fneg float %b +- %fma = call float @llvm.fma.f64(float %a, float %negb, float %c) ++ %nega = fneg float %a ++ %fma = call float @llvm.fma.f64(float %nega, float %b, float %c) + ret float %fma + } + +@@ -882,6 +1093,8 @@ define float @fnmsub_s_contract(float %a, float %b, float %c) nounwind { + ; LA64-CONTRACT-OFF-NEXT: fnmsub.s $fa0, $fa0, $fa1, $fa2 + ; LA64-CONTRACT-OFF-NEXT: ret + %mul = fmul contract float %a, %b +- %sub = fsub contract float %c, %mul +- ret float %sub ++ %negc = fneg contract float %c ++ %add = fadd contract float %negc, %mul ++ %negadd = fneg contract float %add ++ ret float %negadd + } +-- +2.20.1 + + +From 7a3bd125d9c1d0265b265ce238a88d0d4550e5a0 Mon Sep 17 00:00:00 2001 +From: Weining Lu +Date: Wed, 3 Jan 2024 13:59:12 +0800 +Subject: [PATCH 13/14] [LoongArch] Fix the procossor series mask + +Refer PRID_SERIES_MASK definition in linux kernel: +arch/loongarch/include/asm/cpu.h. + +(cherry picked from commit 7e186d366d6c7def0543acc255931f617e76dff0) +--- + llvm/lib/TargetParser/Host.cpp | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp +index 81309280a44b..d11dc605e188 100644 +--- a/llvm/lib/TargetParser/Host.cpp ++++ b/llvm/lib/TargetParser/Host.cpp +@@ -1462,7 +1462,8 @@ StringRef sys::getHostCPUName() { + // Use processor id to detect cpu name. + uint32_t processor_id; + __asm__("cpucfg %[prid], $zero\n\t" : [prid] "=r"(processor_id)); +- switch (processor_id & 0xff00) { ++ // Refer PRID_SERIES_MASK in linux kernel: arch/loongarch/include/asm/cpu.h. ++ switch (processor_id & 0xf000) { + case 0xc000: // Loongson 64bit, 4-issue + return "la464"; + // TODO: Others. 
+-- +2.20.1 + + +From 3634ac4cbc475509c46521f5b8a3fcbeca6d06c7 Mon Sep 17 00:00:00 2001 +From: wanglei +Date: Mon, 11 Mar 2024 08:59:17 +0800 +Subject: [PATCH 14/14] [LoongArch] Make sure that the LoongArchISD::BSTRINS + node uses the correct `MSB` value (#84454) + +The `MSB` must not be greater than `GRLen`. Without this patch, newly +added test cases will crash with LoongArch32, resulting in a 'cannot +select' error. + +(cherry picked from commit edd4c6c6dca4c556de22b2ab73d5bfc02d28e59b) +(cherry picked from commit d77c5c3830d925b3795e2f1535a6568399fe6626) +--- + llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 4 +++- + llvm/test/CodeGen/LoongArch/bstrins_w.ll | 13 +++++++++++++ + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index ed106cb766bc..5affaf37ad5a 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -2310,7 +2310,9 @@ Retry: + return DAG.getNode( + LoongArchISD::BSTRINS, DL, ValTy, N0.getOperand(0), + DAG.getConstant(CN1->getSExtValue() >> MaskIdx0, DL, ValTy), +- DAG.getConstant((MaskIdx0 + MaskLen0 - 1), DL, GRLenVT), ++ DAG.getConstant(ValBits == 32 ? (MaskIdx0 + (MaskLen0 & 31) - 1) ++ : (MaskIdx0 + MaskLen0 - 1), ++ DL, GRLenVT), + DAG.getConstant(MaskIdx0, DL, GRLenVT)); + } + +diff --git a/llvm/test/CodeGen/LoongArch/bstrins_w.ll b/llvm/test/CodeGen/LoongArch/bstrins_w.ll +index dfbe000841cd..e008caacad2a 100644 +--- a/llvm/test/CodeGen/LoongArch/bstrins_w.ll ++++ b/llvm/test/CodeGen/LoongArch/bstrins_w.ll +@@ -145,6 +145,19 @@ define i32 @pat5(i32 %a) nounwind { + ret i32 %or + } + ++;; The high bits of `const` are zero. 
++define i32 @pat5_high_zeros(i32 %a) nounwind { ++; CHECK-LABEL: pat5_high_zeros: ++; CHECK: # %bb.0: ++; CHECK-NEXT: lu12i.w $a1, 1 ++; CHECK-NEXT: ori $a1, $a1, 564 ++; CHECK-NEXT: bstrins.w $a0, $a1, 31, 16 ++; CHECK-NEXT: ret ++ %and = and i32 %a, 65535 ; 0x0000ffff ++ %or = or i32 %and, 305397760 ; 0x12340000 ++ ret i32 %or ++} ++ + ;; Pattern 6: a = b | ((c & mask) << shamt) + ;; In this testcase b is 0x10000002, but in fact we do not require b being a + ;; constant. As long as all positions in b to be overwritten by the incoming +-- +2.20.1 + diff --git a/0013-Backport-Bitcode-Add-some-missing-GetTypeByID-failure-checks.patch b/0013-Backport-Bitcode-Add-some-missing-GetTypeByID-failure-checks.patch new file mode 100644 index 0000000..0d365db --- /dev/null +++ b/0013-Backport-Bitcode-Add-some-missing-GetTypeByID-failure-checks.patch @@ -0,0 +1,37 @@ +From 58f4b115f97435efa1df1baa3247791a13444c00 Mon Sep 17 00:00:00 2001 +From: Nikita Popov +Date: Tue, 26 Sep 2023 16:51:40 +0200 +Subject: [PATCH 1/3] [Bitcode] Add some missing GetTypeByID failure checks + +Print an error instead of crashing. + +Fixes https://github.com/llvm/llvm-project/issues/67388. 
+--- + llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +index 0a9a80688a41..2f024899b451 100644 +--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp ++++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +@@ -1315,7 +1315,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + + unsigned TyID = Record[0]; + Type *Ty = Callbacks.GetTypeByID(TyID); +- if (Ty->isMetadataTy() || Ty->isVoidTy()) { ++ if (!Ty || Ty->isMetadataTy() || Ty->isVoidTy()) { + dropRecord(); + break; + } +@@ -1366,7 +1366,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + + unsigned TyID = Record[0]; + Type *Ty = Callbacks.GetTypeByID(TyID); +- if (Ty->isMetadataTy() || Ty->isVoidTy()) ++ if (!Ty || Ty->isMetadataTy() || Ty->isVoidTy()) + return error("Invalid record"); + + Value *V = ValueList.getValueFwdRef(Record[1], Ty, TyID, +-- +2.33.0 + diff --git a/0014-Backport-X86-Inline-Skip-inline-asm-in-inlining-targ.patch b/0014-Backport-X86-Inline-Skip-inline-asm-in-inlining-targ.patch new file mode 100644 index 0000000..c43bbd3 --- /dev/null +++ b/0014-Backport-X86-Inline-Skip-inline-asm-in-inlining-targ.patch @@ -0,0 +1,74 @@ +From 678cf3a36644847cac4b0be2d919aba77416088a Mon Sep 17 00:00:00 2001 +From: Nikita Popov +Date: Mon, 04 Mar 2024 07:00:37 +0800 +Subject: [PATCH] [Backport][X86][Inline] Skip inline asm in inlining target + feature check + +When inlining across functions with different target features, we +perform roughly two checks: +1. The caller features must be a superset of the callee features. +2. Calls in the callee cannot use types where the target features would +change the call ABI (e.g. by changing whether something is passed in a +zmm or two ymm registers). The latter check is very crude right now. + +The latter check currently also catches inline asm "calls". 
I believe +that inline asm should be excluded from this check, as it is independent +from the usual call ABI, and instead governed by the inline asm +constraint string. +--- + .../lib/Target/X86/X86TargetTransformInfo.cpp | 4 +++ + .../Inline/X86/call-abi-compatibility.ll | 26 +++++++++++++++++++ + 2 files changed, 30 insertions(+) + +diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +index 129a2646d..9c7954230 100644 +--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp ++++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +@@ -6046,6 +6046,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, + + for (const Instruction &I : instructions(Callee)) { + if (const auto *CB = dyn_cast(&I)) { ++ // Having more target features is fine for inline ASM. ++ if (CB->isInlineAsm()) ++ continue; ++ + SmallVector Types; + for (Value *Arg : CB->args()) + Types.push_back(Arg->getType()); +diff --git a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll +index 3a30980fe..6f582cab2 100644 +--- a/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll ++++ b/llvm/test/Transforms/Inline/X86/call-abi-compatibility.ll +@@ -93,3 +93,29 @@ define internal void @caller_not_avx4() { + } + + declare i64 @caller_unknown_simple(i64) ++ ++; This call should get inlined, because the callee only contains ++; inline ASM, not real calls. 
++define <8 x i64> @caller_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #0 { ++; CHECK-LABEL: define {{[^@]+}}@caller_inline_asm ++; CHECK-SAME: (ptr [[P0:%.*]], i64 [[K:%.*]], ptr [[P1:%.*]], ptr [[P2:%.*]]) #[[ATTR2:[0-9]+]] { ++; CHECK-NEXT: [[SRC_I:%.*]] = load <8 x i64>, ptr [[P0]], align 64 ++; CHECK-NEXT: [[A_I:%.*]] = load <8 x i64>, ptr [[P1]], align 64 ++; CHECK-NEXT: [[B_I:%.*]] = load <8 x i64>, ptr [[P2]], align 64 ++; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 [[K]], <8 x i64> [[A_I]], <8 x i64> [[B_I]], <8 x i64> [[SRC_I]]) ++; CHECK-NEXT: ret <8 x i64> [[TMP1]] ++; ++ %call = call <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) ++ ret <8 x i64> %call ++} ++ ++define internal <8 x i64> @callee_inline_asm(ptr %p0, i64 %k, ptr %p1, ptr %p2) #1 { ++ %src = load <8 x i64>, ptr %p0, align 64 ++ %a = load <8 x i64>, ptr %p1, align 64 ++ %b = load <8 x i64>, ptr %p2, align 64 ++ %1 = tail call <8 x i64> asm "vpaddb\09$($3, $2, $0 {$1}", "=v,^Yk,v,v,0,~{dirflag},~{fpsr},~{flags}"(i64 %k, <8 x i64> %a, <8 x i64> %b, <8 x i64> %src) #2 ++ ret <8 x i64> %1 ++} ++ ++attributes #0 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512dq,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } ++attributes #1 = { "min-legal-vector-width"="512" "target-features"="+avx,+avx2,+avx512bw,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } +-- +2.33.0 + diff --git a/0015-Backport-ARM-Check-all-terms-in-emitPopInst-when-clearing-Res.patch b/0015-Backport-ARM-Check-all-terms-in-emitPopInst-when-clearing-Res.patch new file mode 100644 index 0000000..4528f50 --- /dev/null +++ b/0015-Backport-ARM-Check-all-terms-in-emitPopInst-when-clearing-Res.patch @@ -0,0 +1,87 @@ +From 
4aec2da60ce3f639e31d81406c09d5c88b3b8f53 Mon Sep 17 00:00:00 2001 +From: Florian Hahn +Date: Wed, 20 Dec 2023 16:56:15 +0100 +Subject: [PATCH 2/3] [ARM] Check all terms in emitPopInst when clearing + Restored for LR. (#75527) + +emitPopInst checks a single function exit MBB. If other paths also exit +the function and any of there terminators uses LR implicitly, it is not +save to clear the Restored bit. + +Check all terminators for the function before clearing Restored. + +This fixes a mis-compile in outlined-fn-may-clobber-lr-in-caller.ll +where the machine-outliner previously introduced BLs that clobbered LR +which in turn is used by the tail call return. + +Alternative to #73553 +--- + llvm/lib/Target/ARM/ARMFrameLowering.cpp | 30 +++++++++++++++++++++--- + llvm/lib/Target/ARM/ARMFrameLowering.h | 3 +++ + 2 files changed, 30 insertions(+), 3 deletions(-) + +diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp +index 4496d4928ebe..650f4650eef0 100644 +--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp ++++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp +@@ -1645,9 +1645,6 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, + // Fold the return instruction into the LDM. + DeleteRet = true; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; +- // We 'restore' LR into PC so it is not live out of the return block: +- // Clear Restored bit. 
+- Info.setRestored(false); + } + + // If NoGap is true, pop consecutive registers and then leave the rest +@@ -2769,6 +2766,33 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, + AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); + } + ++void ARMFrameLowering::processFunctionBeforeFrameFinalized( ++ MachineFunction &MF, RegScavenger *RS) const { ++ TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); ++ ++ MachineFrameInfo &MFI = MF.getFrameInfo(); ++ if (!MFI.isCalleeSavedInfoValid()) ++ return; ++ ++ // Check if all terminators do not implicitly use LR. Then we can 'restore' LR ++ // into PC so it is not live out of the return block: Clear the Restored bit ++ // in that case. ++ for (CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { ++ if (Info.getReg() != ARM::LR) ++ continue; ++ if (all_of(MF, [](const MachineBasicBlock &MBB) { ++ return all_of(MBB.terminators(), [](const MachineInstr &Term) { ++ return !Term.isReturn() || Term.getOpcode() == ARM::LDMIA_RET || ++ Term.getOpcode() == ARM::t2LDMIA_RET || ++ Term.getOpcode() == ARM::tPOP_RET; ++ }); ++ })) { ++ Info.setRestored(false); ++ break; ++ } ++ } ++} ++ + void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, + BitVector &SavedRegs) const { + TargetFrameLowering::getCalleeSaves(MF, SavedRegs); +diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h +index 16f2ce6bea6f..8d2b8beb9a58 100644 +--- a/llvm/lib/Target/ARM/ARMFrameLowering.h ++++ b/llvm/lib/Target/ARM/ARMFrameLowering.h +@@ -59,6 +59,9 @@ public: + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + ++ void processFunctionBeforeFrameFinalized( ++ MachineFunction &MF, RegScavenger *RS = nullptr) const override; ++ + void adjustForSegmentedStacks(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + +-- +2.33.0 + diff --git a/0016-Backport-ARM-Update-IsRestored-for-LR-based-on-all-returns-82.patch 
b/0016-Backport-ARM-Update-IsRestored-for-LR-based-on-all-returns-82.patch new file mode 100644 index 0000000..eb34372 --- /dev/null +++ b/0016-Backport-ARM-Update-IsRestored-for-LR-based-on-all-returns-82.patch @@ -0,0 +1,116 @@ +From 369bfc8ea8c0a9da51b4bd964f0045cb389c3c2f Mon Sep 17 00:00:00 2001 +From: ostannard +Date: Mon, 26 Feb 2024 12:23:25 +0000 +Subject: [PATCH 3/3] [ARM] Update IsRestored for LR based on all returns + (#82745) + +PR #75527 fixed ARMFrameLowering to set the IsRestored flag for LR based +on all of the return instructions in the function, not just one. +However, there is also code in ARMLoadStoreOptimizer which changes +return instructions, but it set IsRestored based on the one instruction +it changed, not the whole function. + +The fix is to factor out the code added in #75527, and also call it from +ARMLoadStoreOptimizer if it made a change to return instructions. + +Fixes #80287. +--- + llvm/lib/Target/ARM/ARMFrameLowering.cpp | 11 +++++---- + llvm/lib/Target/ARM/ARMFrameLowering.h | 4 ++++ + llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 23 ++++++++----------- + 3 files changed, 21 insertions(+), 17 deletions(-) + +diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp +index 650f4650eef0..008ba4e5924b 100644 +--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp ++++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp +@@ -2766,10 +2766,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, + AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); + } + +-void ARMFrameLowering::processFunctionBeforeFrameFinalized( +- MachineFunction &MF, RegScavenger *RS) const { +- TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); +- ++void ARMFrameLowering::updateLRRestored(MachineFunction &MF) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; +@@ -2793,6 +2790,12 @@ void ARMFrameLowering::processFunctionBeforeFrameFinalized( + } + } + ++void 
ARMFrameLowering::processFunctionBeforeFrameFinalized( ++ MachineFunction &MF, RegScavenger *RS) const { ++ TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); ++ updateLRRestored(MF); ++} ++ + void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, + BitVector &SavedRegs) const { + TargetFrameLowering::getCalleeSaves(MF, SavedRegs); +diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h +index 8d2b8beb9a58..3c7358d8cd53 100644 +--- a/llvm/lib/Target/ARM/ARMFrameLowering.h ++++ b/llvm/lib/Target/ARM/ARMFrameLowering.h +@@ -59,6 +59,10 @@ public: + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const override; + ++ /// Update the IsRestored flag on LR if it is spilled, based on the return ++ /// instructions. ++ static void updateLRRestored(MachineFunction &MF); ++ + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS = nullptr) const override; + +diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +index 93db983b92c0..37d9e1addd1e 100644 +--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp ++++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +@@ -2062,17 +2062,6 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { + MO.setReg(ARM::PC); + PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI); + MBB.erase(MBBI); +- // We now restore LR into PC so it is not live-out of the return block +- // anymore: Clear the CSI Restored bit. 
+- MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); +- // CSI should be fixed after PrologEpilog Insertion +- assert(MFI.isCalleeSavedInfoValid() && "CSI should be valid"); +- for (CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { +- if (Info.getReg() == ARM::LR) { +- Info.setRestored(false); +- break; +- } +- } + return true; + } + } +@@ -2120,14 +2109,22 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + isThumb2 = AFI->isThumb2Function(); + isThumb1 = AFI->isThumbFunction() && !isThumb2; + +- bool Modified = false; ++ bool Modified = false, ModifiedLDMReturn = false; + for (MachineBasicBlock &MBB : Fn) { + Modified |= LoadStoreMultipleOpti(MBB); + if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress()) +- Modified |= MergeReturnIntoLDM(MBB); ++ ModifiedLDMReturn |= MergeReturnIntoLDM(MBB); + if (isThumb1) + Modified |= CombineMovBx(MBB); + } ++ Modified |= ModifiedLDMReturn; ++ ++ // If we merged a BX instruction into an LDM, we need to re-calculate whether ++ // LR is restored. This check needs to consider the whole function, not just ++ // the instruction(s) we changed, because there may be other BX returns which ++ // still need LR to be restored. 
++ if (ModifiedLDMReturn) ++ ARMFrameLowering::updateLRRestored(Fn); + + Allocator.DestroyAll(); + return Modified; +-- +2.33.0 + diff --git a/0017-Add-the-support-for-classic-flang.patch b/0017-Add-the-support-for-classic-flang.patch new file mode 100644 index 0000000..74bcbc8 --- /dev/null +++ b/0017-Add-the-support-for-classic-flang.patch @@ -0,0 +1,1776 @@ +From b297f30783da0dfb3098fe6d39b209caacd45691 Mon Sep 17 00:00:00 2001 +From: luofeng14 +Date: Fri, 1 Mar 2024 18:39:52 +0800 +Subject: [PATCH] sync classic flang patch + +--- + llvm/cmake/modules/HandleLLVMOptions.cmake | 8 + + llvm/include/llvm-c/DebugInfo.h | 5 +- + .../include/llvm/Analysis/TargetLibraryInfo.h | 6 +- + llvm/include/llvm/Analysis/VecFuncs.def | 592 ++++++++++++++++++ + llvm/include/llvm/IR/DIBuilder.h | 28 +- + llvm/include/llvm/IR/DebugInfoMetadata.h | 32 +- + llvm/lib/Analysis/TargetLibraryInfo.cpp | 38 +- + llvm/lib/AsmParser/LLParser.cpp | 22 +- + llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 42 +- + llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 3 +- + llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h | 54 ++ + .../lib/CodeGen/AsmPrinter/DebugLocStream.cpp | 5 + + llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h | 6 + + .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 3 + + llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 4 + + llvm/lib/IR/AsmWriter.cpp | 1 + + llvm/lib/IR/DIBuilder.cpp | 12 +- + llvm/lib/IR/DebugInfo.cpp | 10 +- + llvm/lib/IR/DebugInfoMetadata.cpp | 12 +- + llvm/lib/IR/LLVMContextImpl.h | 11 +- + .../Instrumentation/InstrProfiling.cpp | 4 +- + .../invalid-diglobalvariable-empty-name.ll | 1 + + .../Generic/fortran-subprogram-at.ll | 24 + + .../DebugInfo/Generic/more-subprogram-attr.ll | 38 ++ + llvm/test/DebugInfo/X86/DICommonBlock.ll | 36 ++ + llvm/test/lit.cfg.py | 3 + + llvm/test/lit.site.cfg.py.in | 1 + + llvm/tools/llvm-c-test/debuginfo.c | 4 +- + llvm/unittests/IR/MetadataTest.cpp | 47 +- + llvm/utils/lit/lit/TestingConfig.py | 1 + + llvm/utils/lit/lit/llvm/config.py | 10 + + 31 
files changed, 983 insertions(+), 80 deletions(-) + create mode 100644 llvm/test/DebugInfo/Generic/fortran-subprogram-at.ll + create mode 100644 llvm/test/DebugInfo/Generic/more-subprogram-attr.ll + create mode 100644 llvm/test/DebugInfo/X86/DICommonBlock.ll + +diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake +index 76723be696e5..492ea25b179b 100644 +--- a/llvm/cmake/modules/HandleLLVMOptions.cmake ++++ b/llvm/cmake/modules/HandleLLVMOptions.cmake +@@ -89,6 +89,14 @@ if( LLVM_ENABLE_ASSERTIONS ) + add_compile_definitions(_LIBCPP_ENABLE_HARDENED_MODE) + endif() + ++option(LLVM_ENABLE_CLASSIC_FLANG "Build support for classic Flang instead of the new built-in Flang" OFF) ++if(LLVM_ENABLE_CLASSIC_FLANG) ++ set(LLVM_ENABLE_CLASSIC_FLANG 1) ++ add_definitions( -DENABLE_CLASSIC_FLANG ) ++else() ++ set(LLVM_ENABLE_CLASSIC_FLANG 0) ++endif() ++ + if(LLVM_ENABLE_EXPENSIVE_CHECKS) + add_compile_definitions(EXPENSIVE_CHECKS) + +diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h +index 5924294708cc..09d584c24711 100644 +--- a/llvm/include/llvm-c/DebugInfo.h ++++ b/llvm/include/llvm-c/DebugInfo.h +@@ -1148,7 +1148,8 @@ LLVMMetadataRef LLVMDIBuilderCreateGlobalVariableExpression( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LinkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Expr, LLVMMetadataRef Decl, uint32_t AlignInBits); ++ LLVMMetadataRef Expr, LLVMMetadataRef Decl, LLVMDIFlags Flags, ++ uint32_t AlignInBits); + + + /** +@@ -1246,7 +1247,7 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LnkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Decl, uint32_t AlignInBits); ++ 
LLVMMetadataRef Decl, LLVMDIFlags Flags, uint32_t AlignInBits); + + /** + * Insert a new llvm.dbg.declare intrinsic call before the given instruction. +diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h +index 5d62e837c1f3..490252cd018a 100644 +--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h ++++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h +@@ -22,7 +22,6 @@ namespace llvm { + template class ArrayRef; + class Function; + class Module; +-class Triple; + + /// Describes a possible vectorization of a function. + /// Function 'VectorFnName' is equivalent to 'ScalarFnName' vectorized +@@ -81,6 +80,8 @@ class TargetLibraryInfoImpl { + bool isValidProtoForLibFunc(const FunctionType &FTy, LibFunc F, + const Module &M) const; + ++ Triple T; ++ + public: + /// List of known vector-functions libraries. + /// +@@ -95,6 +96,9 @@ public: + DarwinLibSystemM, // Use Darwin's libsystem_m. + LIBMVEC_X86, // GLIBC Vector Math library. + MASSV, // IBM MASS vector library. ++#ifdef ENABLE_CLASSIC_FLANG ++ PGMATH, // PGI math library. ++#endif + SVML, // Intel short vector math library. + SLEEFGNUABI, // SLEEF - SIMD Library for Evaluating Elementary Functions. + ArmPL // Arm Performance Libraries. 
+diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def +index b884c1e3911e..d1712d158423 100644 +--- a/llvm/include/llvm/Analysis/VecFuncs.def ++++ b/llvm/include/llvm/Analysis/VecFuncs.def +@@ -909,6 +909,596 @@ TLI_DEFINE_VECFUNC("tgammaf", "armpl_vtgammaq_f32", FIXED(4), NOMASK) + TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x", SCALABLE(2), MASKED) + TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED) + ++#elif defined(TLI_DEFINE_PGMATH_AARCH64_VECFUNCS) ++// Classic flang libpgmath library's Vector Functions for AArch64 ++ ++TLI_DEFINE_VECFUNC("__fd_sin_1", "__fd_sin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_sin_1", "__fs_sin_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_sin_1", "__pd_sin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_sin_1", "__ps_sin_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_sin_1", "__rd_sin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_sin_1", "__rs_sin_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_cos_1", "__fd_cos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_cos_1", "__fs_cos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_cos_1", "__pd_cos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_cos_1", "__ps_cos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_cos_1", "__rd_cos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_cos_1", "__rs_cos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_sincos_1", "__fd_sincos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_sincos_1", "__fs_sincos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_sincos_1", "__pd_sincos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_sincos_1", "__ps_sincos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_sincos_1", "__rd_sincos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_sincos_1", "__rs_sincos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_tan_1", "__fd_tan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_tan_1", "__fs_tan_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_tan_1", "__pd_tan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_tan_1", "__ps_tan_4", FIXED(4)) ++ 
++TLI_DEFINE_VECFUNC("__rd_tan_1", "__rd_tan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_tan_1", "__rs_tan_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_sinh_1", "__fd_sinh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_sinh_1", "__fs_sinh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_sinh_1", "__pd_sinh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_sinh_1", "__ps_sinh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_sinh_1", "__rd_sinh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_sinh_1", "__rs_sinh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_cosh_1", "__fd_cosh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_cosh_1", "__fs_cosh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_cosh_1", "__pd_cosh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_cosh_1", "__ps_cosh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_cosh_1", "__rd_cosh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_cosh_1", "__rs_cosh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_tanh_1", "__fd_tanh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_tanh_1", "__fs_tanh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_tanh_1", "__pd_tanh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_tanh_1", "__ps_tanh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_tanh_1", "__rd_tanh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_tanh_1", "__rs_tanh_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_asin_1", "__fd_asin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_asin_1", "__fs_asin_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_asin_1", "__pd_asin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_asin_1", "__ps_asin_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_asin_1", "__rd_asin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_asin_1", "__rs_asin_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_acos_1", "__fd_acos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_acos_1", "__fs_acos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_acos_1", "__pd_acos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_acos_1", "__ps_acos_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_acos_1", "__rd_acos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_acos_1", "__rs_acos_4", FIXED(4)) ++ 
++TLI_DEFINE_VECFUNC("__fd_atan_1", "__fd_atan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_atan_1", "__fs_atan_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_atan_1", "__pd_atan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_atan_1", "__ps_atan_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_atan_1", "__rd_atan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_atan_1", "__rs_atan_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_atan2_1", "__fd_atan2_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_atan2_1", "__fs_atan2_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_atan2_1", "__pd_atan2_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_atan2_1", "__ps_atan2_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_atan2_1", "__rd_atan2_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_atan2_1", "__rs_atan2_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_pow_1", "__fd_pow_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_pow_1", "__fs_pow_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_pow_1", "__pd_pow_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_pow_1", "__ps_pow_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_pow_1", "__rd_pow_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_pow_1", "__rs_pow_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fs_powi_1", "__fs_powi_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__ps_powi_1", "__ps_powi_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rs_powi_1", "__rs_powi_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_powi1_1", "__fd_powi1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_powi1_1", "__fs_powi1_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_powi1_1", "__pd_powi1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_powi1_1", "__ps_powi1_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_powi1_1", "__rd_powi1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_powi1_1", "__rs_powi1_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_powk_1", "__fd_powk_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_powk_1", "__fs_powk_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_powk_1", "__pd_powk_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_powk_1", "__ps_powk_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_powk_1", 
"__rd_powk_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_powk_1", "__rs_powk_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_powk1_1", "__fd_powk1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_powk1_1", "__fs_powk1_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_powk1_1", "__pd_powk1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_powk1_1", "__ps_powk1_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_powk1_1", "__rd_powk1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_powk1_1", "__rs_powk1_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_log10_1", "__fd_log10_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_log10_1", "__fs_log10_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_log10_1", "__pd_log10_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_log10_1", "__ps_log10_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_log10_1", "__rd_log10_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_log10_1", "__rs_log10_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_log_1", "__fd_log_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_log_1", "__fs_log_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_log_1", "__pd_log_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_log_1", "__ps_log_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_log_1", "__rd_log_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_log_1", "__rs_log_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__fd_exp_1", "__fd_exp_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fs_exp_1", "__fs_exp_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__pd_exp_1", "__pd_exp_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__ps_exp_1", "__ps_exp_4", FIXED(4)) ++ ++TLI_DEFINE_VECFUNC("__rd_exp_1", "__rd_exp_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_4", FIXED(4)) ++ ++#elif defined(TLI_DEFINE_PGMATH_X86_VECFUNCS) ++// Classic flang libpgmath library's Vector Functions for X86 ++ ++TLI_DEFINE_VECFUNC("__fd_sin_1", "__fd_sin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_sin_1", "__fd_sin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_sin_1", "__fd_sin_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_sin_1", "__fs_sin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_sin_1", "__fs_sin_8", 
FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_sin_1", "__fs_sin_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_sin_1", "__pd_sin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_sin_1", "__pd_sin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_sin_1", "__pd_sin_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_sin_1", "__ps_sin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_sin_1", "__ps_sin_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_sin_1", "__ps_sin_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_sin_1", "__rd_sin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_sin_1", "__rd_sin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_sin_1", "__rd_sin_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_sin_1", "__rs_sin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_sin_1", "__rs_sin_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_sin_1", "__rs_sin_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_cos_1", "__fd_cos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_cos_1", "__fd_cos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_cos_1", "__fd_cos_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_cos_1", "__fs_cos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_cos_1", "__fs_cos_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_cos_1", "__fs_cos_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_cos_1", "__pd_cos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_cos_1", "__pd_cos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_cos_1", "__pd_cos_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_cos_1", "__ps_cos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_cos_1", "__ps_cos_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_cos_1", "__ps_cos_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_cos_1", "__rd_cos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_cos_1", "__rd_cos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_cos_1", "__rd_cos_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_cos_1", "__rs_cos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_cos_1", "__rs_cos_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_cos_1", "__rs_cos_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_tan_1", "__fd_tan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_tan_1", "__fd_tan_4", 
FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_tan_1", "__fd_tan_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_tan_1", "__fs_tan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_tan_1", "__fs_tan_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_tan_1", "__fs_tan_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_tan_1", "__pd_tan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_tan_1", "__pd_tan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_tan_1", "__pd_tan_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_tan_1", "__ps_tan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_tan_1", "__ps_tan_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_tan_1", "__ps_tan_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_tan_1", "__rd_tan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_tan_1", "__rd_tan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_tan_1", "__rd_tan_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_tan_1", "__rs_tan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_tan_1", "__rs_tan_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_tan_1", "__rs_tan_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_sinh_1", "__fd_sinh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_sinh_1", "__fd_sinh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_sinh_1", "__fd_sinh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_sinh_1", "__fs_sinh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_sinh_1", "__fs_sinh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_sinh_1", "__fs_sinh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_sinh_1", "__pd_sinh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_sinh_1", "__pd_sinh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_sinh_1", "__pd_sinh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_sinh_1", "__ps_sinh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_sinh_1", "__ps_sinh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_sinh_1", "__ps_sinh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_sinh_1", "__rd_sinh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_sinh_1", "__rd_sinh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_sinh_1", "__rd_sinh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_sinh_1", "__rs_sinh_4", FIXED(4)) 
++TLI_DEFINE_VECFUNC("__rs_sinh_1", "__rs_sinh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_sinh_1", "__rs_sinh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_cosh_1", "__fd_cosh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_cosh_1", "__fd_cosh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_cosh_1", "__fd_cosh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_cosh_1", "__fs_cosh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_cosh_1", "__fs_cosh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_cosh_1", "__fs_cosh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_cosh_1", "__pd_cosh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_cosh_1", "__pd_cosh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_cosh_1", "__pd_cosh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_cosh_1", "__ps_cosh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_cosh_1", "__ps_cosh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_cosh_1", "__ps_cosh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_cosh_1", "__rd_cosh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_cosh_1", "__rd_cosh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_cosh_1", "__rd_cosh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_cosh_1", "__rs_cosh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_cosh_1", "__rs_cosh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_cosh_1", "__rs_cosh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_tanh_1", "__fd_tanh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_tanh_1", "__fd_tanh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_tanh_1", "__fd_tanh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_tanh_1", "__fs_tanh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_tanh_1", "__fs_tanh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_tanh_1", "__fs_tanh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_tanh_1", "__pd_tanh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_tanh_1", "__pd_tanh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_tanh_1", "__pd_tanh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_tanh_1", "__ps_tanh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_tanh_1", "__ps_tanh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_tanh_1", "__ps_tanh_16", FIXED(16)) ++ 
++TLI_DEFINE_VECFUNC("__rd_tanh_1", "__rd_tanh_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_tanh_1", "__rd_tanh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_tanh_1", "__rd_tanh_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_tanh_1", "__rs_tanh_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_tanh_1", "__rs_tanh_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_tanh_1", "__rs_tanh_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_asin_1", "__fd_asin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_asin_1", "__fd_asin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_asin_1", "__fd_asin_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_asin_1", "__fs_asin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_asin_1", "__fs_asin_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_asin_1", "__fs_asin_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_asin_1", "__pd_asin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_asin_1", "__pd_asin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_asin_1", "__pd_asin_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_asin_1", "__ps_asin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_asin_1", "__ps_asin_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_asin_1", "__ps_asin_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_asin_1", "__rd_asin_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_asin_1", "__rd_asin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_asin_1", "__rd_asin_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_asin_1", "__rs_asin_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_asin_1", "__rs_asin_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_asin_1", "__rs_asin_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_acos_1", "__fd_acos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_acos_1", "__fd_acos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_acos_1", "__fd_acos_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_acos_1", "__fs_acos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_acos_1", "__fs_acos_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_acos_1", "__fs_acos_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_acos_1", "__pd_acos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_acos_1", "__pd_acos_4", FIXED(4)) 
++TLI_DEFINE_VECFUNC("__pd_acos_1", "__pd_acos_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_acos_1", "__ps_acos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_acos_1", "__ps_acos_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_acos_1", "__ps_acos_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_acos_1", "__rd_acos_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_acos_1", "__rd_acos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_acos_1", "__rd_acos_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_acos_1", "__rs_acos_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_acos_1", "__rs_acos_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_acos_1", "__rs_acos_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_atan_1", "__fd_atan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_atan_1", "__fd_atan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_atan_1", "__fd_atan_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_atan_1", "__fs_atan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_atan_1", "__fs_atan_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_atan_1", "__fs_atan_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_atan_1", "__pd_atan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_atan_1", "__pd_atan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_atan_1", "__pd_atan_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_atan_1", "__ps_atan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_atan_1", "__ps_atan_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_atan_1", "__ps_atan_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_atan_1", "__rd_atan_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_atan_1", "__rd_atan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_atan_1", "__rd_atan_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_atan_1", "__rs_atan_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_atan_1", "__rs_atan_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_atan_1", "__rs_atan_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_atan2_1", "__fd_atan2_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_atan2_1", "__fd_atan2_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_atan2_1", "__fd_atan2_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_atan2_1", "__fs_atan2_4", 
FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_atan2_1", "__fs_atan2_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_atan2_1", "__fs_atan2_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_atan2_1", "__pd_atan2_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_atan2_1", "__pd_atan2_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_atan2_1", "__pd_atan2_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_atan2_1", "__ps_atan2_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_atan2_1", "__ps_atan2_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_atan2_1", "__ps_atan2_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_atan2_1", "__rd_atan2_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_atan2_1", "__rd_atan2_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_atan2_1", "__rd_atan2_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_atan2_1", "__rs_atan2_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_atan2_1", "__rs_atan2_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_atan2_1", "__rs_atan2_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_pow_1", "__fd_pow_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_pow_1", "__fd_pow_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_pow_1", "__fd_pow_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_pow_1", "__fs_pow_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_pow_1", "__fs_pow_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_pow_1", "__fs_pow_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_pow_1", "__pd_pow_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_pow_1", "__pd_pow_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_pow_1", "__pd_pow_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_pow_1", "__ps_pow_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_pow_1", "__ps_pow_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_pow_1", "__ps_pow_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_pow_1", "__rd_pow_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_pow_1", "__rd_pow_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_pow_1", "__rd_pow_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_pow_1", "__rs_pow_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_pow_1", "__rs_pow_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_pow_1", "__rs_pow_16", FIXED(16)) ++ 
++TLI_DEFINE_VECFUNC("__fs_powi_1", "__fs_powi_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_powi_1", "__fs_powi_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_powi_1", "__fs_powi_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__ps_powi_1", "__ps_powi_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_powi_1", "__ps_powi_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_powi_1", "__ps_powi_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rs_powi_1", "__rs_powi_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_powi_1", "__rs_powi_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_powi_1", "__rs_powi_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_powi1_1", "__fd_powi1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_powi1_1", "__fd_powi1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_powi1_1", "__fd_powi1_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_powi1_1", "__fs_powi1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_powi1_1", "__fs_powi1_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_powi1_1", "__fs_powi1_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_powi1_1", "__pd_powi1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_powi1_1", "__pd_powi1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_powi1_1", "__pd_powi1_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_powi1_1", "__ps_powi1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_powi1_1", "__ps_powi1_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_powi1_1", "__ps_powi1_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_powi1_1", "__rd_powi1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_powi1_1", "__rd_powi1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_powi1_1", "__rd_powi1_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_powi1_1", "__rs_powi1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_powi1_1", "__rs_powi1_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_powi1_1", "__rs_powi1_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_powk_1", "__fd_powk_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_powk_1", "__fd_powk_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_powk_1", "__fd_powk_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_powk_1", "__fs_powk_4", FIXED(4)) 
++TLI_DEFINE_VECFUNC("__fs_powk_1", "__fs_powk_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_powk_1", "__fs_powk_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_powk_1", "__pd_powk_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_powk_1", "__pd_powk_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_powk_1", "__pd_powk_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_powk_1", "__ps_powk_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_powk_1", "__ps_powk_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_powk_1", "__ps_powk_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_powk_1", "__rd_powk_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_powk_1", "__rd_powk_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_powk_1", "__rd_powk_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_powk_1", "__rs_powk_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_powk_1", "__rs_powk_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_powk_1", "__rs_powk_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_powk1_1", "__fd_powk1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_powk1_1", "__fd_powk1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_powk1_1", "__fd_powk1_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_powk1_1", "__fs_powk1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_powk1_1", "__fs_powk1_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_powk1_1", "__fs_powk1_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_powk1_1", "__pd_powk1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_powk1_1", "__pd_powk1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_powk1_1", "__pd_powk1_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_powk1_1", "__ps_powk1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_powk1_1", "__ps_powk1_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_powk1_1", "__ps_powk1_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_powk1_1", "__rd_powk1_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_powk1_1", "__rd_powk1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_powk1_1", "__rd_powk1_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_powk1_1", "__rs_powk1_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_powk1_1", "__rs_powk1_8", FIXED(8)) 
++TLI_DEFINE_VECFUNC("__rs_powk1_1", "__rs_powk1_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_log10_1", "__fd_log10_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_log10_1", "__fd_log10_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_log10_1", "__fd_log10_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_log10_1", "__fs_log10_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_log10_1", "__fs_log10_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_log10_1", "__fs_log10_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_log10_1", "__pd_log10_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_log10_1", "__pd_log10_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_log10_1", "__pd_log10_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_log10_1", "__ps_log10_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_log10_1", "__ps_log10_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_log10_1", "__ps_log10_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_log10_1", "__rd_log10_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_log10_1", "__rd_log10_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_log10_1", "__rd_log10_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_log10_1", "__rs_log10_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_log10_1", "__rs_log10_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_log10_1", "__rs_log10_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fd_log_1", "__fd_log_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__fd_log_1", "__fd_log_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fd_log_1", "__fd_log_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__fs_log_1", "__fs_log_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_log_1", "__fs_log_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_log_1", "__fs_log_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_log_1", "__pd_log_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_log_1", "__pd_log_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_log_1", "__pd_log_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_log_1", "__ps_log_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_log_1", "__ps_log_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_log_1", "__ps_log_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_log_1", "__rd_log_2", 
FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_log_1", "__rd_log_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_log_1", "__rd_log_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_log_1", "__rs_log_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_log_1", "__rs_log_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_log_1", "__rs_log_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__fs_exp_1", "__fs_exp_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__fs_exp_1", "__fs_exp_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__fs_exp_1", "__fs_exp_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__pd_exp_1", "__pd_exp_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__pd_exp_1", "__pd_exp_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__pd_exp_1", "__pd_exp_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__ps_exp_1", "__ps_exp_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__ps_exp_1", "__ps_exp_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__ps_exp_1", "__ps_exp_16", FIXED(16)) ++ ++TLI_DEFINE_VECFUNC("__rd_exp_1", "__rd_exp_2", FIXED(2)) ++TLI_DEFINE_VECFUNC("__rd_exp_1", "__rd_exp_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rd_exp_1", "__rd_exp_8", FIXED(8)) ++ ++TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_4", FIXED(4)) ++TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_8", FIXED(8)) ++TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_16", FIXED(16)) ++ + #else + #error "Must choose which vector library functions are to be defined." 
+ #endif +@@ -929,3 +1519,5 @@ TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED) + #undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS + #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES + #undef TLI_DEFINE_ARMPL_VECFUNCS ++#undef TLI_DEFINE_PGMATH_AARCH64_VECFUNCS ++#undef TLI_DEFINE_PGMATH_X86_VECFUNCS +diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h +index ecd6dd7b0a4f..e0e64c662f18 100644 +--- a/llvm/include/llvm/IR/DIBuilder.h ++++ b/llvm/include/llvm/IR/DIBuilder.h +@@ -681,15 +681,22 @@ namespace llvm { + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned LineNo, DIType *Ty, bool IsLocalToUnit, bool isDefined = true, + DIExpression *Expr = nullptr, MDNode *Decl = nullptr, +- MDTuple *TemplateParams = nullptr, uint32_t AlignInBits = 0, +- DINodeArray Annotations = nullptr); ++ MDTuple *TemplateParams = nullptr, ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::DIFlags Flags = DINode::FlagZero, ++#endif ++ uint32_t AlignInBits = 0, DINodeArray Annotations = nullptr); + + /// Identical to createGlobalVariable + /// except that the resulting DbgNode is temporary and meant to be RAUWed. + DIGlobalVariable *createTempGlobalVariableFwdDecl( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File, +- unsigned LineNo, DIType *Ty, bool IsLocalToUnit, MDNode *Decl = nullptr, +- MDTuple *TemplateParams = nullptr, uint32_t AlignInBits = 0); ++ unsigned LineNo, DIType *Ty, bool isLocalToUnit, MDNode *Decl = nullptr, ++ MDTuple *TemplateParams = nullptr, ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::DIFlags Flags = DINode::FlagZero, ++#endif ++ uint32_t AlignInBits = 0); + + /// Create a new descriptor for an auto variable. This is a local variable + /// that is not a subprogram parameter. 
+@@ -820,6 +827,19 @@ namespace llvm { + StringRef Name, DIFile *File, + unsigned LineNo); + ++#ifdef ENABLE_CLASSIC_FLANG ++ /// Create common block entry for a Fortran common block ++ /// \param Scope Scope of this common block ++ /// \param Name The name of this common block ++ /// \param File The file this common block is defined ++ /// \param LineNo Line number ++ /// \param VarList List of variables that a located in common block ++ /// \param AlignInBits Common block alignment ++ DICommonBlock *createCommonBlock(DIScope *Scope, DIGlobalVariable *decl, ++ StringRef Name, DIFile *File, ++ unsigned LineNo, uint32_t AlignInBits = 0); ++ ++#endif + /// This creates new descriptor for a namespace with the specified + /// parent scope. + /// \param Scope Namespace scope +diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h +index 656122405209..9bd86172a4c0 100644 +--- a/llvm/include/llvm/IR/DebugInfoMetadata.h ++++ b/llvm/include/llvm/IR/DebugInfoMetadata.h +@@ -3062,12 +3062,14 @@ class DIGlobalVariable : public DIVariable { + + bool IsLocalToUnit; + bool IsDefinition; ++ DIFlags Flags; + + DIGlobalVariable(LLVMContext &C, StorageType Storage, unsigned Line, +- bool IsLocalToUnit, bool IsDefinition, uint32_t AlignInBits, +- ArrayRef Ops) ++ bool IsLocalToUnit, bool IsDefinition, DIFlags Flags, ++ uint32_t AlignInBits, ArrayRef Ops) + : DIVariable(C, DIGlobalVariableKind, Storage, Line, Ops, AlignInBits), +- IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition) {} ++ IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition), ++ Flags(Flags) {} + ~DIGlobalVariable() = default; + + static DIGlobalVariable * +@@ -3075,12 +3077,12 @@ class DIGlobalVariable : public DIVariable { + StringRef LinkageName, DIFile *File, unsigned Line, DIType *Type, + bool IsLocalToUnit, bool IsDefinition, + DIDerivedType *StaticDataMemberDeclaration, MDTuple *TemplateParams, +- uint32_t AlignInBits, DINodeArray Annotations, StorageType Storage, 
+- bool ShouldCreate = true) { ++ DIFlags Flags, uint32_t AlignInBits, DINodeArray Annotations, ++ StorageType Storage, bool ShouldCreate = true) { + return getImpl(Context, Scope, getCanonicalMDString(Context, Name), + getCanonicalMDString(Context, LinkageName), File, Line, Type, + IsLocalToUnit, IsDefinition, StaticDataMemberDeclaration, +- cast_or_null(TemplateParams), AlignInBits, ++ cast_or_null(TemplateParams), Flags, AlignInBits, + Annotations.get(), Storage, ShouldCreate); + } + static DIGlobalVariable * +@@ -3088,14 +3090,14 @@ class DIGlobalVariable : public DIVariable { + MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, + bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams, +- uint32_t AlignInBits, Metadata *Annotations, StorageType Storage, +- bool ShouldCreate = true); ++ DIFlags Flags, uint32_t AlignInBits, Metadata *Annotations, ++ StorageType Storage, bool ShouldCreate = true); + + TempDIGlobalVariable cloneImpl() const { + return getTemporary(getContext(), getScope(), getName(), getLinkageName(), + getFile(), getLine(), getType(), isLocalToUnit(), + isDefinition(), getStaticDataMemberDeclaration(), +- getTemplateParams(), getAlignInBits(), ++ getTemplateParams(), getFlags(), getAlignInBits(), + getAnnotations()); + } + +@@ -3105,22 +3107,26 @@ public: + (DIScope * Scope, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned Line, DIType *Type, bool IsLocalToUnit, bool IsDefinition, + DIDerivedType *StaticDataMemberDeclaration, MDTuple *TemplateParams, +- uint32_t AlignInBits, DINodeArray Annotations), ++ DIFlags Flags, uint32_t AlignInBits, DINodeArray Annotations), + (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, TemplateParams, AlignInBits, Annotations)) ++ StaticDataMemberDeclaration, TemplateParams, Flags, AlignInBits, ++ Annotations)) + DEFINE_MDNODE_GET( + DIGlobalVariable, + (Metadata * Scope, 
MDString *Name, MDString *LinkageName, Metadata *File, + unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams, +- uint32_t AlignInBits, Metadata *Annotations), ++ DIFlags Flags, uint32_t AlignInBits, Metadata *Annotations), + (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, TemplateParams, AlignInBits, Annotations)) ++ StaticDataMemberDeclaration, TemplateParams, Flags, AlignInBits, ++ Annotations)) + + TempDIGlobalVariable clone() const { return cloneImpl(); } + + bool isLocalToUnit() const { return IsLocalToUnit; } + bool isDefinition() const { return IsDefinition; } ++ DIFlags getFlags() const { return Flags; } ++ bool isArtificial() const { return getFlags() & FlagArtificial; } + StringRef getDisplayName() const { return getStringOperand(4); } + StringRef getLinkageName() const { return getStringOperand(5); } + DIDerivedType *getStaticDataMemberDeclaration() const { +diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp +index 05fa67d0bbf1..a6593f6b3757 100644 +--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp ++++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp +@@ -30,6 +30,10 @@ static cl::opt ClVectorLibrary( + "GLIBC Vector Math library"), + clEnumValN(TargetLibraryInfoImpl::MASSV, "MASSV", + "IBM MASS vector library"), ++#ifdef ENABLE_CLASSIC_FLANG ++ clEnumValN(TargetLibraryInfoImpl::PGMATH, "PGMATH", ++ "PGI math library"), ++#endif + clEnumValN(TargetLibraryInfoImpl::SVML, "SVML", + "Intel SVML library"), + clEnumValN(TargetLibraryInfoImpl::SLEEFGNUABI, "sleefgnuabi", +@@ -867,14 +871,14 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, + TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary, T); + } + +-TargetLibraryInfoImpl::TargetLibraryInfoImpl() { ++TargetLibraryInfoImpl::TargetLibraryInfoImpl() : T(Triple()) { + // Default to everything being available. 
+ memset(AvailableArray, -1, sizeof(AvailableArray)); + +- initialize(*this, Triple(), StandardNames); ++ initialize(*this, T, StandardNames); + } + +-TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T) { ++TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T) : T(T) { + // Default to everything being available. + memset(AvailableArray, -1, sizeof(AvailableArray)); + +@@ -886,7 +890,7 @@ TargetLibraryInfoImpl::TargetLibraryInfoImpl(const TargetLibraryInfoImpl &TLI) + ShouldExtI32Return(TLI.ShouldExtI32Return), + ShouldSignExtI32Param(TLI.ShouldSignExtI32Param), + ShouldSignExtI32Return(TLI.ShouldSignExtI32Return), +- SizeOfInt(TLI.SizeOfInt) { ++ SizeOfInt(TLI.SizeOfInt), T(TLI.T) { + memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); + VectorDescs = TLI.VectorDescs; + ScalarDescs = TLI.ScalarDescs; +@@ -898,7 +902,7 @@ TargetLibraryInfoImpl::TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI) + ShouldExtI32Return(TLI.ShouldExtI32Return), + ShouldSignExtI32Param(TLI.ShouldSignExtI32Param), + ShouldSignExtI32Return(TLI.ShouldSignExtI32Return), +- SizeOfInt(TLI.SizeOfInt) { ++ SizeOfInt(TLI.SizeOfInt), T(TLI.T) { + std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray), + AvailableArray); + VectorDescs = TLI.VectorDescs; +@@ -912,6 +916,7 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(const TargetLibraryInfoI + ShouldSignExtI32Param = TLI.ShouldSignExtI32Param; + ShouldSignExtI32Return = TLI.ShouldSignExtI32Return; + SizeOfInt = TLI.SizeOfInt; ++ T = TLI.T; + memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); + return *this; + } +@@ -923,6 +928,7 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(TargetLibraryInfoImpl && + ShouldSignExtI32Param = TLI.ShouldSignExtI32Param; + ShouldSignExtI32Return = TLI.ShouldSignExtI32Return; + SizeOfInt = TLI.SizeOfInt; ++ T = TLI.T; + std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray), + AvailableArray); + return *this; +@@ 
-1234,6 +1240,28 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( + } + break; + } ++#ifdef ENABLE_CLASSIC_FLANG ++ // NOTE: All routines listed here are not available on all the architectures. ++ // Based on the size of vector registers available and the size of data, the ++ // vector width should be chosen correctly. ++ case PGMATH: { ++ if (T.getArch() == Triple::aarch64) { ++ const VecDesc VecFuncs[] = { ++ #define TLI_DEFINE_PGMATH_AARCH64_VECFUNCS ++ #include "llvm/Analysis/VecFuncs.def" ++ }; ++ addVectorizableFunctions(VecFuncs); ++ } else if (T.getArch() == Triple::x86_64) { ++ const VecDesc VecFuncs[] = { ++ #define TLI_DEFINE_PGMATH_X86_VECFUNCS ++ #include "llvm/Analysis/VecFuncs.def" ++ }; ++ addVectorizableFunctions(VecFuncs); ++ } ++ break; ++ } ++#endif ++ + case NoLibrary: + break; + } +diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp +index 5f0d1a76de79..d7eb34e3d148 100644 +--- a/llvm/lib/AsmParser/LLParser.cpp ++++ b/llvm/lib/AsmParser/LLParser.cpp +@@ -5405,6 +5405,22 @@ bool LLParser::parseDITemplateValueParameter(MDNode *&Result, bool IsDistinct) { + /// isDefinition: true, templateParams: !3, + /// declaration: !4, align: 8) + bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { ++#ifdef ENABLE_CLASSIC_FLANG ++#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ ++ OPTIONAL(name, MDStringField, (/* AllowEmpty */ true)); \ ++ OPTIONAL(scope, MDField, ); \ ++ OPTIONAL(linkageName, MDStringField, ); \ ++ OPTIONAL(file, MDField, ); \ ++ OPTIONAL(line, LineField, ); \ ++ OPTIONAL(type, MDField, ); \ ++ OPTIONAL(isLocal, MDBoolField, ); \ ++ OPTIONAL(isDefinition, MDBoolField, (true)); \ ++ OPTIONAL(templateParams, MDField, ); \ ++ OPTIONAL(declaration, MDField, ); \ ++ OPTIONAL(flags, DIFlagField, ); \ ++ OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ ++ OPTIONAL(annotations, MDField, ); ++#else + #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + OPTIONAL(name, MDStringField, (/* 
AllowEmpty */ false)); \ + OPTIONAL(scope, MDField, ); \ +@@ -5416,8 +5432,10 @@ bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { + OPTIONAL(isDefinition, MDBoolField, (true)); \ + OPTIONAL(templateParams, MDField, ); \ + OPTIONAL(declaration, MDField, ); \ ++ OPTIONAL(flags, DIFlagField, ); \ + OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ + OPTIONAL(annotations, MDField, ); ++#endif + PARSE_MD_FIELDS(); + #undef VISIT_MD_FIELDS + +@@ -5425,8 +5443,8 @@ bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { + GET_OR_DISTINCT(DIGlobalVariable, + (Context, scope.Val, name.Val, linkageName.Val, file.Val, + line.Val, type.Val, isLocal.Val, isDefinition.Val, +- declaration.Val, templateParams.Val, align.Val, +- annotations.Val)); ++ declaration.Val, templateParams.Val, flags.Val, ++ align.Val, annotations.Val)); + return false; + } + +diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +index 0a9a80688a41..c21e5e5dba97 100644 +--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp ++++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +@@ -1979,25 +1979,43 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + break; + } + case bitc::METADATA_GLOBAL_VAR: { +- if (Record.size() < 11 || Record.size() > 13) ++ if (Record.size() < 11 || Record.size() > 14) + return error("Invalid record"); + + IsDistinct = Record[0] & 1; + unsigned Version = Record[0] >> 1; + +- if (Version == 2) { ++ if (Version == 3) { ++ // Add support for DIFlags ++ Metadata *Annotations = nullptr; ++ if (Record.size() > 13) ++ Annotations = getMDOrNull(Record[13]); ++ ++ MetadataList.assignValue( ++ GET_OR_DISTINCT( ++ DIGlobalVariable, ++ (Context, getMDOrNull(Record[1]), getMDString(Record[2]), ++ getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], ++ getDITypeRefOrNull(Record[6]), Record[7], Record[8], ++ getMDOrNull(Record[9]), getMDOrNull(Record[10]), ++ static_cast(Record[11]), Record[12], ++ 
Annotations)), ++ NextMetadataNo); ++ ++ NextMetadataNo++; ++ } else if (Version == 2) { + Metadata *Annotations = nullptr; + if (Record.size() > 12) + Annotations = getMDOrNull(Record[12]); + + MetadataList.assignValue( +- GET_OR_DISTINCT(DIGlobalVariable, +- (Context, getMDOrNull(Record[1]), +- getMDString(Record[2]), getMDString(Record[3]), +- getMDOrNull(Record[4]), Record[5], +- getDITypeRefOrNull(Record[6]), Record[7], Record[8], +- getMDOrNull(Record[9]), getMDOrNull(Record[10]), +- Record[11], Annotations)), ++ GET_OR_DISTINCT( ++ DIGlobalVariable, ++ (Context, getMDOrNull(Record[1]), getMDString(Record[2]), ++ getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], ++ getDITypeRefOrNull(Record[6]), Record[7], Record[8], ++ getMDOrNull(Record[9]), getMDOrNull(Record[10]), ++ DINode::FlagZero, Record[11], Annotations)), + NextMetadataNo); + + NextMetadataNo++; +@@ -2010,7 +2028,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + (Context, getMDOrNull(Record[1]), getMDString(Record[2]), + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getDITypeRefOrNull(Record[6]), Record[7], Record[8], +- getMDOrNull(Record[10]), nullptr, Record[11], nullptr)), ++ getMDOrNull(Record[10]), nullptr, DINode::FlagZero, Record[11], ++ nullptr)), + NextMetadataNo); + + NextMetadataNo++; +@@ -2043,7 +2062,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + (Context, getMDOrNull(Record[1]), getMDString(Record[2]), + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getDITypeRefOrNull(Record[6]), Record[7], Record[8], +- getMDOrNull(Record[10]), nullptr, AlignInBits, nullptr)); ++ getMDOrNull(Record[10]), nullptr, DINode::FlagZero, AlignInBits, ++ nullptr)); + + DIGlobalVariableExpression *DGVE = nullptr; + if (Attach || Expr) +diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +index 9416c7f5a03e..013e7ce2d425 100644 +--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp ++++ 
b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +@@ -2014,7 +2014,7 @@ void ModuleBitcodeWriter::writeDITemplateValueParameter( + void ModuleBitcodeWriter::writeDIGlobalVariable( + const DIGlobalVariable *N, SmallVectorImpl &Record, + unsigned Abbrev) { +- const uint64_t Version = 2 << 1; ++ const uint64_t Version = 3 << 1; + Record.push_back((uint64_t)N->isDistinct() | Version); + Record.push_back(VE.getMetadataOrNullID(N->getScope())); + Record.push_back(VE.getMetadataOrNullID(N->getRawName())); +@@ -2026,6 +2026,7 @@ void ModuleBitcodeWriter::writeDIGlobalVariable( + Record.push_back(N->isDefinition()); + Record.push_back(VE.getMetadataOrNullID(N->getStaticDataMemberDeclaration())); + Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams())); ++ Record.push_back(N->getFlags()); + Record.push_back(N->getAlignInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get())); + +diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +index 726aba18bb80..ee8be3921ab7 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h ++++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +@@ -115,6 +115,29 @@ class DbgValueLoc { + SmallVector ValueLocEntries; + + bool IsVariadic; ++ /// Type of entry that this represents. ++ enum EntryType { ++ E_Location, ++ E_Integer, ++ E_ConstantFP, ++ E_ConstantInt, ++ E_TargetIndexLocation ++ }; ++ enum EntryType EntryKind; ++ ++ /// Either a constant, ++ union { ++ int64_t Int; ++ const ConstantFP *CFP; ++ const ConstantInt *CIP; ++ } Constant; ++ ++ union { ++ /// Or a location in the machine frame. ++ MachineLocation Loc; ++ /// Or a location from target specific location. 
++ TargetIndexLocation TIL; ++ }; + + public: + DbgValueLoc(const DIExpression *Expr, ArrayRef Locs) +@@ -139,6 +162,37 @@ public: + assert(((Expr && Expr->isValid()) || !Loc.isLocation()) && + "DBG_VALUE with a machine location must have a valid expression."); + } ++ DbgValueLoc(const DIExpression *Expr, int64_t i) ++ : Expression(Expr), EntryKind(E_Integer) { ++ Constant.Int = i; ++ } ++ DbgValueLoc(const DIExpression *Expr, const ConstantFP *CFP) ++ : Expression(Expr), EntryKind(E_ConstantFP) { ++ Constant.CFP = CFP; ++ } ++ DbgValueLoc(const DIExpression *Expr, const ConstantInt *CIP) ++ : Expression(Expr), EntryKind(E_ConstantInt) { ++ Constant.CIP = CIP; ++ } ++ DbgValueLoc(const DIExpression *Expr, MachineLocation Loc) ++ : Expression(Expr), EntryKind(E_Location), Loc(Loc) { ++ assert(cast(Expr)->isValid()); ++ } ++ DbgValueLoc(const DIExpression *Expr, TargetIndexLocation Loc) ++ : Expression(Expr), EntryKind(E_TargetIndexLocation), TIL(Loc) {} ++ ++ bool isLocation() const { return EntryKind == E_Location; } ++ bool isTargetIndexLocation() const { ++ return EntryKind == E_TargetIndexLocation; ++ } ++ bool isInt() const { return EntryKind == E_Integer; } ++ bool isConstantFP() const { return EntryKind == E_ConstantFP; } ++ bool isConstantInt() const { return EntryKind == E_ConstantInt; } ++ int64_t getInt() const { return Constant.Int; } ++ const ConstantFP *getConstantFP() const { return Constant.CFP; } ++ const ConstantInt *getConstantInt() const { return Constant.CIP; } ++ MachineLocation getLoc() const { return Loc; } ++ TargetIndexLocation getTargetIndexLocation() const { return TIL; } + + bool isFragment() const { return getExpression()->isFragment(); } + bool isEntryVal() const { return getExpression()->isEntryValue(); } +diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp +index 8c6109880afc..fbfcf65a34ec 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp ++++ 
b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp +@@ -38,6 +38,11 @@ void DebugLocStream::finalizeEntry() { + } + + DebugLocStream::ListBuilder::~ListBuilder() { ++#ifdef ENABLE_CLASSIC_FLANG ++ if (Finalized) ++ return; ++ Finalized = true; ++#endif + if (!Locs.finalizeList(Asm)) + return; + V.initializeDbgValue(&MI); +diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h +index a96bdd034918..0600f4f09d5e 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h ++++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h +@@ -158,12 +158,18 @@ class DebugLocStream::ListBuilder { + DbgVariable &V; + const MachineInstr &MI; + size_t ListIndex; ++#ifdef ENABLE_CLASSIC_FLANG ++ bool Finalized; ++#endif + std::optional TagOffset; + + public: + ListBuilder(DebugLocStream &Locs, DwarfCompileUnit &CU, AsmPrinter &Asm, + DbgVariable &V, const MachineInstr &MI) + : Locs(Locs), Asm(Asm), V(V), MI(MI), ListIndex(Locs.startList(&CU)), ++#ifdef ENABLE_CLASSIC_FLANG ++ Finalized(false), ++#endif + TagOffset(std::nullopt) {} + + void setTagOffset(uint8_t TO) { +diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +index 58ed21379d29..78ff0d351492 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp ++++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +@@ -185,6 +185,9 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( + else + addGlobalName(GV->getName(), *VariableDIE, DeclContext); + ++ if (GV->isArtificial()) ++ addFlag(*VariableDIE, dwarf::DW_AT_artificial); ++ + addAnnotation(*VariableDIE, GV->getAnnotations()); + + if (uint32_t AlignInBytes = GV->getAlignInBytes()) +diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +index 1af4b643eb17..e526614792c7 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h ++++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +@@ -284,6 +284,8 @@ struct SymbolCU { + DwarfCompileUnit 
*CU; + }; + ++class DummyDwarfExpression; ++ + /// The kind of accelerator tables we should emit. + enum class AccelTableKind { + Default, ///< Platform default. +@@ -437,6 +439,8 @@ private: + /// Map for tracking Fortran deferred CHARACTER lengths. + DenseMap StringTypeLocMap; + ++ DenseMap VariableInDependentType; ++ + AddressPool AddrPool; + + /// Accelerator tables. +diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp +index be4a3ed79d88..c47dd4664ea6 100644 +--- a/llvm/lib/IR/AsmWriter.cpp ++++ b/llvm/lib/IR/AsmWriter.cpp +@@ -2281,6 +2281,7 @@ static void writeDIGlobalVariable(raw_ostream &Out, const DIGlobalVariable *N, + Printer.printBool("isDefinition", N->isDefinition()); + Printer.printMetadata("declaration", N->getRawStaticDataMemberDeclaration()); + Printer.printMetadata("templateParams", N->getRawTemplateParams()); ++ Printer.printDIFlags("flags", N->getFlags()); + Printer.printInt("align", N->getAlignInBits()); + Printer.printMetadata("annotations", N->getRawAnnotations()); + Out << ")"; +diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp +index 1ce8c17f8a88..af6ebf702165 100644 +--- a/llvm/lib/IR/DIBuilder.cpp ++++ b/llvm/lib/IR/DIBuilder.cpp +@@ -725,14 +725,14 @@ DIGlobalVariableExpression *DIBuilder::createGlobalVariableExpression( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, + unsigned LineNumber, DIType *Ty, bool IsLocalToUnit, bool isDefined, + DIExpression *Expr, MDNode *Decl, MDTuple *TemplateParams, +- uint32_t AlignInBits, DINodeArray Annotations) { ++ DINode::DIFlags Flags, uint32_t AlignInBits, DINodeArray Annotations) { + checkGlobalVariableScope(Context); + + auto *GV = DIGlobalVariable::getDistinct( + VMContext, cast_or_null(Context), Name, LinkageName, F, + LineNumber, Ty, IsLocalToUnit, isDefined, +- cast_or_null(Decl), TemplateParams, AlignInBits, +- Annotations); ++ cast_or_null(Decl), TemplateParams, Flags, ++ AlignInBits, Annotations); + if (!Expr) + Expr = 
createExpression(); + auto *N = DIGlobalVariableExpression::get(VMContext, GV, Expr); +@@ -743,14 +743,14 @@ DIGlobalVariableExpression *DIBuilder::createGlobalVariableExpression( + DIGlobalVariable *DIBuilder::createTempGlobalVariableFwdDecl( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, + unsigned LineNumber, DIType *Ty, bool IsLocalToUnit, MDNode *Decl, +- MDTuple *TemplateParams, uint32_t AlignInBits) { ++ MDTuple *TemplateParams, DINode::DIFlags Flags, uint32_t AlignInBits) { + checkGlobalVariableScope(Context); + + return DIGlobalVariable::getTemporary( + VMContext, cast_or_null(Context), Name, LinkageName, F, + LineNumber, Ty, IsLocalToUnit, false, +- cast_or_null(Decl), TemplateParams, AlignInBits, +- nullptr) ++ cast_or_null(Decl), TemplateParams, Flags, ++ AlignInBits, nullptr) + .release(); + } + +diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp +index 48b5501c55ba..3696beccdd0c 100644 +--- a/llvm/lib/IR/DebugInfo.cpp ++++ b/llvm/lib/IR/DebugInfo.cpp +@@ -1547,12 +1547,13 @@ LLVMMetadataRef LLVMDIBuilderCreateGlobalVariableExpression( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LinkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Expr, LLVMMetadataRef Decl, uint32_t AlignInBits) { ++ LLVMMetadataRef Expr, LLVMMetadataRef Decl, LLVMDIFlags Flags, ++ uint32_t AlignInBits) { + return wrap(unwrap(Builder)->createGlobalVariableExpression( + unwrapDI(Scope), {Name, NameLen}, {Linkage, LinkLen}, + unwrapDI(File), LineNo, unwrapDI(Ty), LocalToUnit, + true, unwrap(Expr), unwrapDI(Decl), +- nullptr, AlignInBits)); ++ nullptr, map_from_llvmDIFlags(Flags), AlignInBits)); + } + + LLVMMetadataRef LLVMDIGlobalVariableExpressionGetVariable(LLVMMetadataRef GVE) { +@@ -1597,11 +1598,12 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, 
const char *Name, + size_t NameLen, const char *Linkage, size_t LnkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Decl, uint32_t AlignInBits) { ++ LLVMMetadataRef Decl, LLVMDIFlags Flags, uint32_t AlignInBits) { + return wrap(unwrap(Builder)->createTempGlobalVariableFwdDecl( + unwrapDI(Scope), {Name, NameLen}, {Linkage, LnkLen}, + unwrapDI(File), LineNo, unwrapDI(Ty), LocalToUnit, +- unwrapDI(Decl), nullptr, AlignInBits)); ++ unwrapDI(Decl), nullptr, map_from_llvmDIFlags(Flags), ++ AlignInBits)); + } + + LLVMValueRef +diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp +index 4933b6032688..d599896ee456 100644 +--- a/llvm/lib/IR/DebugInfoMetadata.cpp ++++ b/llvm/lib/IR/DebugInfoMetadata.cpp +@@ -1258,15 +1258,16 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, + MDString *LinkageName, Metadata *File, unsigned Line, + Metadata *Type, bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, +- Metadata *TemplateParams, uint32_t AlignInBits, +- Metadata *Annotations, StorageType Storage, +- bool ShouldCreate) { ++ Metadata *TemplateParams, DIFlags Flags, ++ uint32_t AlignInBits, Metadata *Annotations, ++ StorageType Storage, bool ShouldCreate) { + assert(isCanonical(Name) && "Expected canonical MDString"); + assert(isCanonical(LinkageName) && "Expected canonical MDString"); + DEFINE_GETIMPL_LOOKUP( + DIGlobalVariable, + (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, TemplateParams, AlignInBits, Annotations)); ++ StaticDataMemberDeclaration, TemplateParams, Flags, AlignInBits, ++ Annotations)); + Metadata *Ops[] = {Scope, + Name, + File, +@@ -1277,7 +1278,8 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, + TemplateParams, + Annotations}; + DEFINE_GETIMPL_STORE(DIGlobalVariable, +- (Line, IsLocalToUnit, IsDefinition, AlignInBits), 
Ops); ++ (Line, IsLocalToUnit, IsDefinition, Flags, AlignInBits), ++ Ops); + } + + DILocalVariable * +diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h +index 4cc3f8da6b75..8a621725f55e 100644 +--- a/llvm/lib/IR/LLVMContextImpl.h ++++ b/llvm/lib/IR/LLVMContextImpl.h +@@ -1055,6 +1055,7 @@ template <> struct MDNodeKeyImpl { + bool IsDefinition; + Metadata *StaticDataMemberDeclaration; + Metadata *TemplateParams; ++ unsigned Flags; + uint32_t AlignInBits; + Metadata *Annotations; + +@@ -1062,20 +1063,21 @@ template <> struct MDNodeKeyImpl { + Metadata *File, unsigned Line, Metadata *Type, + bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams, ++ unsigned Flags, + uint32_t AlignInBits, Metadata *Annotations) + : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File), + Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit), + IsDefinition(IsDefinition), + StaticDataMemberDeclaration(StaticDataMemberDeclaration), +- TemplateParams(TemplateParams), AlignInBits(AlignInBits), +- Annotations(Annotations) {} ++ TemplateParams(TemplateParams), Flags(Flags), ++ AlignInBits(AlignInBits), Annotations(Annotations) {} + MDNodeKeyImpl(const DIGlobalVariable *N) + : Scope(N->getRawScope()), Name(N->getRawName()), + LinkageName(N->getRawLinkageName()), File(N->getRawFile()), + Line(N->getLine()), Type(N->getRawType()), + IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()), + StaticDataMemberDeclaration(N->getRawStaticDataMemberDeclaration()), +- TemplateParams(N->getRawTemplateParams()), ++ TemplateParams(N->getRawTemplateParams()), Flags(N->getFlags()), + AlignInBits(N->getAlignInBits()), Annotations(N->getRawAnnotations()) {} + + bool isKeyOf(const DIGlobalVariable *RHS) const { +@@ -1087,6 +1089,7 @@ template <> struct MDNodeKeyImpl { + StaticDataMemberDeclaration == + RHS->getRawStaticDataMemberDeclaration() && + TemplateParams == RHS->getRawTemplateParams() && ++ Flags == 
RHS->getFlags() && + AlignInBits == RHS->getAlignInBits() && + Annotations == RHS->getRawAnnotations(); + } +@@ -1101,7 +1104,7 @@ template <> struct MDNodeKeyImpl { + // TODO: make hashing work fine with such situations + return hash_combine(Scope, Name, LinkageName, File, Line, Type, + IsLocalToUnit, IsDefinition, /* AlignInBits, */ +- StaticDataMemberDeclaration, Annotations); ++ StaticDataMemberDeclaration, Flags, Annotations); + } + }; + +diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +index a7b1953ce81c..136132d7e65a 100644 +--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp ++++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +@@ -1052,8 +1052,8 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { + SP, CounterPtr->getName(), /*LinkageName=*/StringRef(), SP->getFile(), + /*LineNo=*/0, DB.createUnspecifiedType("Profile Data Type"), + CounterPtr->hasLocalLinkage(), /*IsDefined=*/true, /*Expr=*/nullptr, +- /*Decl=*/nullptr, /*TemplateParams=*/nullptr, /*AlignInBits=*/0, +- Annotations); ++ /*Decl=*/nullptr, /*TemplateParams=*/nullptr, ++ /*Flags=*/DINode::FlagZero, /*AlignInBits=*/0, Annotations); + CounterPtr->addDebugInfo(DICounter); + DB.finalize(); + } else { +diff --git a/llvm/test/Assembler/invalid-diglobalvariable-empty-name.ll b/llvm/test/Assembler/invalid-diglobalvariable-empty-name.ll +index a4e69f3c8b75..d3c476a03198 100644 +--- a/llvm/test/Assembler/invalid-diglobalvariable-empty-name.ll ++++ b/llvm/test/Assembler/invalid-diglobalvariable-empty-name.ll +@@ -1,4 +1,5 @@ + ; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s ++; UNSUPPORTED: classic_flang + + ; CHECK: :[[@LINE+1]]:30: error: 'name' cannot be empty + !0 = !DIGlobalVariable(name: "") +diff --git a/llvm/test/DebugInfo/Generic/fortran-subprogram-at.ll b/llvm/test/DebugInfo/Generic/fortran-subprogram-at.ll +new file mode 100644 +index 000000000000..988c388fe218 +--- 
/dev/null ++++ b/llvm/test/DebugInfo/Generic/fortran-subprogram-at.ll +@@ -0,0 +1,24 @@ ++; Test for DIFlagPure, DIFlagElement and DIFlagRecursive. These three ++; DIFlags are used to attach DW_AT_pure, DW_AT_element, and DW_AT_recursive ++; attributes to DW_TAG_subprogram DIEs. ++ ++; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s ++; CHECK: !DISubprogram({{.*}}, spFlags: DISPFlagDefinition | DISPFlagPure | DISPFlagElemental | DISPFlagRecursive, ++ ++!llvm.module.flags = !{!0, !1} ++!llvm.dbg.cu = !{!2} ++ ++define void @subprgm() !dbg !6 { ++L: ++ ret void ++} ++ ++!0 = !{i32 2, !"Dwarf Version", i32 2} ++!1 = !{i32 1, !"Debug Info Version", i32 3} ++!2 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !3, producer: "Flang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !4, imports: !4) ++!3 = !DIFile(filename: "fortran-subprogram-at.f", directory: "/") ++!4 = !{} ++!5 = !DIBasicType(name: "real", size: 32, align: 32, encoding: DW_ATE_float) ++!6 = distinct !DISubprogram(name: "subprgm", scope: !2, file: !3, line: 256, type: !7, scopeLine: 256, spFlags: DISPFlagDefinition | DISPFlagPure | DISPFlagElemental | DISPFlagRecursive, unit: !2) ++!7 = !DISubroutineType(types: !8) ++!8 = !{null, !5} +diff --git a/llvm/test/DebugInfo/Generic/more-subprogram-attr.ll b/llvm/test/DebugInfo/Generic/more-subprogram-attr.ll +new file mode 100644 +index 000000000000..0533cf6b2367 +--- /dev/null ++++ b/llvm/test/DebugInfo/Generic/more-subprogram-attr.ll +@@ -0,0 +1,38 @@ ++; REQUIRES: object-emission ++ ++; RUN: %llc_dwarf -O0 -filetype=obj < %s > %t ++; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s ++ ++; Make sure we're emitting DW_AT_{pure,elemental,recursive}. 
++; CHECK: DW_TAG_subprogram ++; CHECK-NOT: {{DW_TAG|NULL}} ++; CHECK: DW_AT_name {{.*}} "main" ++; CHECK-NOT: {{DW_TAG|NULL}} ++; CHECK: DW_AT_pure [DW_FORM_flag_present] (true) ++; CHECK: DW_AT_elemental [DW_FORM_flag_present] (true) ++; CHECK: DW_AT_recursive [DW_FORM_flag_present] (true) ++ ++define dso_local i32 @main() !dbg !7 { ++entry: ++ %retval = alloca i32, align 4 ++ store i32 0, i32* %retval, align 4 ++ ret i32 0, !dbg !12 ++} ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5} ++!llvm.ident = !{!6} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) ++!1 = !DIFile(filename: "x.c", directory: "/tmp") ++!2 = !{} ++!3 = !{i32 2, !"Dwarf Version", i32 4} ++!4 = !{i32 2, !"Debug Info Version", i32 3} ++!5 = !{i32 1, !"wchar_size", i32 4} ++!6 = !{!"clang"} ++!7 = distinct !DISubprogram(name: "main", scope: !8, file: !8, line: 1, type: !9, scopeLine: 2, spFlags: DISPFlagDefinition | DISPFlagPure | DISPFlagElemental | DISPFlagRecursive, unit: !0, retainedNodes: !2) ++!8 = !DIFile(filename: "x.c", directory: "/tmp") ++!9 = !DISubroutineType(types: !10) ++!10 = !{!11} ++!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ++!12 = !DILocation(line: 3, column: 3, scope: !7) +diff --git a/llvm/test/DebugInfo/X86/DICommonBlock.ll b/llvm/test/DebugInfo/X86/DICommonBlock.ll +new file mode 100644 +index 000000000000..6cfb7a90640d +--- /dev/null ++++ b/llvm/test/DebugInfo/X86/DICommonBlock.ll +@@ -0,0 +1,36 @@ ++; ModuleID = 'none.f90' ++; RUN: llc %s -o %t -filetype=obj ++; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s ++; CHECK: DW_TAG_common_block ++ ++target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ++target triple = "x86_64-apple-macosx" ++ ++@common_a = common global [32 x i8] zeroinitializer, align 8, !dbg !13 ++ ++define i32 @subr() !dbg !9 { ++ %1 = getelementptr inbounds [32 x i8], [32 x 
i8]* @common_a, i64 0, i32 8 ++ %2 = bitcast i8* %1 to i32* ++ %3 = load i32, i32* %2 ++ ret i32 %3 ++} ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!6, !7} ++!llvm.ident = !{!8} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_Fortran90, file: !1, producer: "PGI Fortran", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, retainedTypes: !14, globals: !2) ++!1 = !DIFile(filename: "none.f90", directory: "/not/here/") ++!2 = !{!13} ++!3 = !{} ++!4 = !DIGlobalVariable(name: "common /a/", scope: !5, file: !1, line: 4, isLocal: false, isDefinition: true, type: !12) ++!5 = !DICommonBlock(scope: !9, declaration: !4, name: "a", file: !1, line: 4) ++!6 = !{i32 2, !"Dwarf Version", i32 4} ++!7 = !{i32 2, !"Debug Info Version", i32 3} ++!8 = !{!"PGI Fortran"} ++!9 = distinct !DISubprogram(name: "subrtn", scope: !0, file: !1, line: 1, type: !10, isLocal: false, isDefinition: true, unit: !0) ++!10 = !DISubroutineType(types: !11) ++!11 = !{!12, !12} ++!12 = !DIBasicType(name: "int", size: 32) ++!13 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression()) ++!14 = !{!12, !10} +diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py +index 4114bf7f54b2..9cc8520960c2 100644 +--- a/llvm/test/lit.cfg.py ++++ b/llvm/test/lit.cfg.py +@@ -645,3 +645,6 @@ if "aix" in config.target_triple: + # "OBJECT_MODE" to 'any' by default on AIX OS. 
+ if "system-aix" in config.available_features: + config.environment["OBJECT_MODE"] = "any" ++ ++if config.use_classic_flang: ++ config.available_features.add("classic_flang") +diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in +index 57ee2100dfb7..fc7ab6536309 100644 +--- a/llvm/test/lit.site.cfg.py.in ++++ b/llvm/test/lit.site.cfg.py.in +@@ -61,6 +61,7 @@ config.expensive_checks = @LLVM_ENABLE_EXPENSIVE_CHECKS@ + config.reverse_iteration = @LLVM_ENABLE_REVERSE_ITERATION@ + config.dxil_tests = @LLVM_INCLUDE_DXIL_TESTS@ + config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@ ++config.use_classic_flang = @LLVM_ENABLE_CLASSIC_FLANG@ + + import lit.llvm + lit.llvm.initialize(lit_config, config) +diff --git a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c +index a3e41be12e95..906c96f1c24b 100644 +--- a/llvm/tools/llvm-c-test/debuginfo.c ++++ b/llvm/tools/llvm-c-test/debuginfo.c +@@ -64,7 +64,7 @@ int llvm_test_dibuilder(void) { + LLVMDIBuilderCreateConstantValueExpression(DIB, 0); + LLVMDIBuilderCreateGlobalVariableExpression( + DIB, Module, "globalClass", 11, "", 0, File, 1, ClassTy, true, +- GlobalClassValueExpr, NULL, 0); ++ GlobalClassValueExpr, NULL, LLVMDIFlagZero, 0); + + LLVMMetadataRef Int64Ty = + LLVMDIBuilderCreateBasicType(DIB, "Int64", 5, 64, 0, LLVMDIFlagZero); +@@ -75,7 +75,7 @@ int llvm_test_dibuilder(void) { + LLVMDIBuilderCreateConstantValueExpression(DIB, 0); + LLVMDIBuilderCreateGlobalVariableExpression( + DIB, Module, "global", 6, "", 0, File, 1, Int64TypeDef, true, +- GlobalVarValueExpr, NULL, 0); ++ GlobalVarValueExpr, NULL, LLVMDIFlagZero, 0); + + LLVMMetadataRef NameSpace = + LLVMDIBuilderCreateNameSpace(DIB, Module, "NameSpace", 9, false); +diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp +index 5342360109d0..4bce26851d2f 100644 +--- a/llvm/unittests/IR/MetadataTest.cpp ++++ b/llvm/unittests/IR/MetadataTest.cpp +@@ -2896,12 +2896,13 @@ 
TEST_F(DIGlobalVariableTest, get) { + DIDerivedType *StaticDataMemberDeclaration = + cast(getDerivedType()); + ++ DINode::DIFlags Flags = static_cast(7); + uint32_t AlignInBits = 8; + + auto *N = DIGlobalVariable::get( + Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, +- IsDefinition, StaticDataMemberDeclaration, templateParams, AlignInBits, +- nullptr); ++ IsDefinition, StaticDataMemberDeclaration, templateParams, Flags, ++ AlignInBits, nullptr); + + EXPECT_EQ(dwarf::DW_TAG_variable, N->getTag()); + EXPECT_EQ(Scope, N->getScope()); +@@ -2914,57 +2915,66 @@ TEST_F(DIGlobalVariableTest, get) { + EXPECT_EQ(IsDefinition, N->isDefinition()); + EXPECT_EQ(StaticDataMemberDeclaration, N->getStaticDataMemberDeclaration()); + EXPECT_EQ(templateParams, N->getTemplateParams()); ++ EXPECT_EQ(Flags, N->getFlags()); + EXPECT_EQ(AlignInBits, N->getAlignInBits()); + EXPECT_EQ(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, ++ nullptr)); + + EXPECT_NE(N, DIGlobalVariable::get( + Context, getSubprogram(), Name, LinkageName, File, Line, + Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, templateParams, AlignInBits, +- nullptr)); ++ StaticDataMemberDeclaration, templateParams, Flags, ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, "other", LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, ++ nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, "other", File, Line, + Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, + getFile(), 
Line, Type, IsLocalToUnit, + IsDefinition, StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line + 1, Type, IsLocalToUnit, + IsDefinition, StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, getDerivedType(), IsLocalToUnit, + IsDefinition, StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, !IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, !IsDefinition, + StaticDataMemberDeclaration, +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + cast(getDerivedType()), +- templateParams, AlignInBits, nullptr)); ++ templateParams, Flags, AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, nullptr, ++ Flags, AlignInBits, nullptr)); ++ EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, ++ Line, Type, IsLocalToUnit, IsDefinition, ++ StaticDataMemberDeclaration, ++ templateParams, ++ static_cast(Flags + 1), + AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, 
(AlignInBits << 1), ++ templateParams, Flags, (AlignInBits << 1), + nullptr)); + + TempDIGlobalVariable Temp = N->clone(); +@@ -2987,16 +2997,17 @@ TEST_F(DIGlobalVariableExpressionTest, get) { + auto *Expr2 = DIExpression::get(Context, {1, 2, 3}); + DIDerivedType *StaticDataMemberDeclaration = + cast(getDerivedType()); ++ DINode::DIFlags Flags = static_cast(7); + uint32_t AlignInBits = 8; + + auto *Var = DIGlobalVariable::get( + Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, +- IsDefinition, StaticDataMemberDeclaration, templateParams, AlignInBits, +- nullptr); ++ IsDefinition, StaticDataMemberDeclaration, templateParams, Flags, ++ AlignInBits, nullptr); + auto *Var2 = DIGlobalVariable::get( + Context, Scope, "other", LinkageName, File, Line, Type, IsLocalToUnit, +- IsDefinition, StaticDataMemberDeclaration, templateParams, AlignInBits, +- nullptr); ++ IsDefinition, StaticDataMemberDeclaration, templateParams, Flags, ++ AlignInBits, nullptr); + auto *N = DIGlobalVariableExpression::get(Context, Var, Expr); + + EXPECT_EQ(Var, N->getVariable()); +diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py +index 76fd66502009..399b74c4f00d 100644 +--- a/llvm/utils/lit/lit/TestingConfig.py ++++ b/llvm/utils/lit/lit/TestingConfig.py +@@ -26,6 +26,7 @@ class TestingConfig(object): + "SYSTEMROOT", + "TERM", + "CLANG", ++ "FLANG", + "LLDB", + "LD_PRELOAD", + "LLVM_SYMBOLIZER_PATH", +diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py +index 16cc2968034b..d8dec6160071 100644 +--- a/llvm/utils/lit/lit/llvm/config.py ++++ b/llvm/utils/lit/lit/llvm/config.py +@@ -500,6 +500,8 @@ class LLVMConfig(object): + just-built or optionally an installed clang, and add a set of standard + substitutions useful to any test suite that makes use of clang. + ++ Also sets up use of flang ++ + """ + # Clear some environment variables that might affect Clang. 
+ # +@@ -631,6 +633,14 @@ class LLVMConfig(object): + self.add_tool_substitutions(tool_substitutions) + self.config.substitutions.append(("%resource_dir", builtin_include_dir)) + ++ self.config.flang = self.use_llvm_tool( ++ 'flang', search_env='FLANG', required=required) ++ if self.config.flang: ++ tool_substitutions = [ ++ ToolSubst('%flang', command=self.config.flang) ++ ] ++ self.add_tool_substitutions(tool_substitutions) ++ + self.config.substitutions.append( + ( + "%itanium_abi_triple", +-- +2.24.3 (Apple Git-128) + diff --git a/0018-Fix-declaration-definition-mismatch-for-classic-flang.patch b/0018-Fix-declaration-definition-mismatch-for-classic-flang.patch new file mode 100644 index 0000000..e33514d --- /dev/null +++ b/0018-Fix-declaration-definition-mismatch-for-classic-flang.patch @@ -0,0 +1,1041 @@ +From d6d17d8f0362ac47100be32f0d9cb31fd66a1060 Mon Sep 17 00:00:00 2001 +From: luofeng14 +Date: Wed, 17 Apr 2024 14:26:07 +0800 +Subject: [PATCH] Fix declaration definition mismatch for classic flang + +--- + llvm/cmake/modules/TableGen.cmake | 4 + + llvm/include/llvm-c/DebugInfo.h | 11 ++- + .../include/llvm/Analysis/TargetLibraryInfo.h | 5 + + llvm/include/llvm/Analysis/VecFuncs.def | 8 ++ + llvm/include/llvm/IR/DebugInfoMetadata.h | 65 ++++++++++--- + llvm/lib/Analysis/TargetLibraryInfo.cpp | 24 +++++ + llvm/lib/AsmParser/LLParser.cpp | 6 +- + llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 44 ++++++--- + llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 ++ + llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h | 4 + + .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 4 +- + llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 8 +- + llvm/lib/IR/AsmWriter.cpp | 2 + + llvm/lib/IR/DIBuilder.cpp | 21 +++- + llvm/lib/IR/DebugInfo.cpp | 22 ++++- + llvm/lib/IR/DebugInfoMetadata.cpp | 19 +++- + llvm/lib/IR/LLVMContextImpl.h | 22 ++++- + .../Instrumentation/InstrProfiling.cpp | 5 +- + llvm/tools/llvm-c-test/debuginfo.c | 12 ++- + llvm/unittests/IR/MetadataTest.cpp | 97 +++++++++++++++---- + 
llvm/utils/lit/lit/llvm/config.py | 17 ++-- + 21 files changed, 321 insertions(+), 85 deletions(-) + +diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake +index 7fd6628ef55d..d4d2c06c051b 100644 +--- a/llvm/cmake/modules/TableGen.cmake ++++ b/llvm/cmake/modules/TableGen.cmake +@@ -76,6 +76,10 @@ function(tablegen project ofn) + set(tblgen_change_flag "--write-if-changed") + endif() + ++ if (LLVM_ENABLE_CLASSIC_FLANG) ++ list(APPEND tblgen_change_flag "-DENABLE_CLASSIC_FLANG") ++ endif() ++ + if (NOT LLVM_ENABLE_WARNINGS) + list(APPEND LLVM_TABLEGEN_FLAGS "-no-warn-on-unused-template-args") + endif() +diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h +index 09d584c24711..0201bac4349d 100644 +--- a/llvm/include/llvm-c/DebugInfo.h ++++ b/llvm/include/llvm-c/DebugInfo.h +@@ -1148,7 +1148,10 @@ LLVMMetadataRef LLVMDIBuilderCreateGlobalVariableExpression( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LinkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Expr, LLVMMetadataRef Decl, LLVMDIFlags Flags, ++ LLVMMetadataRef Expr, LLVMMetadataRef Decl, ++#ifdef ENABLE_CLASSIC_FLANG ++ LLVMDIFlags Flags, ++#endif + uint32_t AlignInBits); + + +@@ -1247,7 +1250,11 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LnkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Decl, LLVMDIFlags Flags, uint32_t AlignInBits); ++ LLVMMetadataRef Decl, ++#ifdef ENABLE_CLASSIC_FLANG ++ LLVMDIFlags Flags, ++#endif ++ uint32_t AlignInBits); + + /** + * Insert a new llvm.dbg.declare intrinsic call before the given instruction. 
+diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h +index 490252cd018a..6805c6535189 100644 +--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h ++++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h +@@ -22,6 +22,9 @@ namespace llvm { + template class ArrayRef; + class Function; + class Module; ++#ifndef ENABLE_CLASSIC_FLANG ++class Triple; ++#endif + + /// Describes a possible vectorization of a function. + /// Function 'VectorFnName' is equivalent to 'ScalarFnName' vectorized +@@ -80,7 +83,9 @@ class TargetLibraryInfoImpl { + bool isValidProtoForLibFunc(const FunctionType &FTy, LibFunc F, + const Module &M) const; + ++#ifdef ENABLE_CLASSIC_FLANG + Triple T; ++#endif + + public: + /// List of known vector-functions libraries. +diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def +index d1712d158423..679e28057d6e 100644 +--- a/llvm/include/llvm/Analysis/VecFuncs.def ++++ b/llvm/include/llvm/Analysis/VecFuncs.def +@@ -910,6 +910,8 @@ TLI_DEFINE_VECFUNC("tgamma", "armpl_svtgamma_f64_x", SCALABLE(2), MASKED) + TLI_DEFINE_VECFUNC("tgammaf", "armpl_svtgamma_f32_x", SCALABLE(4), MASKED) + + #elif defined(TLI_DEFINE_PGMATH_AARCH64_VECFUNCS) ++ ++#ifdef ENABLE_CLASSIC_FLANG + // Classic flang libpgmath library's Vector Functions for AArch64 + + TLI_DEFINE_VECFUNC("__fd_sin_1", "__fd_sin_2", FIXED(2)) +@@ -1079,8 +1081,11 @@ TLI_DEFINE_VECFUNC("__ps_exp_1", "__ps_exp_4", FIXED(4)) + + TLI_DEFINE_VECFUNC("__rd_exp_1", "__rd_exp_2", FIXED(2)) + TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_4", FIXED(4)) ++#endif + + #elif defined(TLI_DEFINE_PGMATH_X86_VECFUNCS) ++ ++#ifdef ENABLE_CLASSIC_FLANG + // Classic flang libpgmath library's Vector Functions for X86 + + TLI_DEFINE_VECFUNC("__fd_sin_1", "__fd_sin_2", FIXED(2)) +@@ -1498,6 +1503,7 @@ TLI_DEFINE_VECFUNC("__rd_exp_1", "__rd_exp_8", FIXED(8)) + TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_4", FIXED(4)) + 
TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_8", FIXED(8)) + TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_16", FIXED(16)) ++#endif + + #else + #error "Must choose which vector library functions are to be defined." +@@ -1519,5 +1525,7 @@ TLI_DEFINE_VECFUNC("__rs_exp_1", "__rs_exp_16", FIXED(16)) + #undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS + #undef TLI_DEFINE_MASSV_VECFUNCS_NAMES + #undef TLI_DEFINE_ARMPL_VECFUNCS ++#ifdef ENABLE_CLASSIC_FLANG + #undef TLI_DEFINE_PGMATH_AARCH64_VECFUNCS + #undef TLI_DEFINE_PGMATH_X86_VECFUNCS ++#endif +diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h +index 9bd86172a4c0..277c2ddd4dd9 100644 +--- a/llvm/include/llvm/IR/DebugInfoMetadata.h ++++ b/llvm/include/llvm/IR/DebugInfoMetadata.h +@@ -3062,14 +3062,23 @@ class DIGlobalVariable : public DIVariable { + + bool IsLocalToUnit; + bool IsDefinition; ++#ifdef ENABLE_CLASSIC_FLANG + DIFlags Flags; ++#endif + + DIGlobalVariable(LLVMContext &C, StorageType Storage, unsigned Line, +- bool IsLocalToUnit, bool IsDefinition, DIFlags Flags, ++ bool IsLocalToUnit, bool IsDefinition, ++#ifdef ENABLE_CLASSIC_FLANG ++ DIFlags Flags, ++#endif + uint32_t AlignInBits, ArrayRef Ops) + : DIVariable(C, DIGlobalVariableKind, Storage, Line, Ops, AlignInBits), +- IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition), +- Flags(Flags) {} ++#ifdef ENABLE_CLASSIC_FLANG ++ IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition), Flags(Flags) {} ++#else ++ IsLocalToUnit(IsLocalToUnit), IsDefinition(IsDefinition) {} ++#endif ++ + ~DIGlobalVariable() = default; + + static DIGlobalVariable * +@@ -3077,28 +3086,40 @@ class DIGlobalVariable : public DIVariable { + StringRef LinkageName, DIFile *File, unsigned Line, DIType *Type, + bool IsLocalToUnit, bool IsDefinition, + DIDerivedType *StaticDataMemberDeclaration, MDTuple *TemplateParams, +- DIFlags Flags, uint32_t AlignInBits, DINodeArray Annotations, ++#ifdef ENABLE_CLASSIC_FLANG ++ DIFlags Flags, ++#endif ++ 
uint32_t AlignInBits, DINodeArray Annotations, + StorageType Storage, bool ShouldCreate = true) { + return getImpl(Context, Scope, getCanonicalMDString(Context, Name), + getCanonicalMDString(Context, LinkageName), File, Line, Type, + IsLocalToUnit, IsDefinition, StaticDataMemberDeclaration, +- cast_or_null(TemplateParams), Flags, AlignInBits, +- Annotations.get(), Storage, ShouldCreate); ++ cast_or_null(TemplateParams), ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, Annotations.get(), Storage, ShouldCreate); + } + static DIGlobalVariable * + getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, + MDString *LinkageName, Metadata *File, unsigned Line, Metadata *Type, + bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams, +- DIFlags Flags, uint32_t AlignInBits, Metadata *Annotations, ++#ifdef ENABLE_CLASSIC_FLANG ++ DIFlags Flags, ++#endif ++ uint32_t AlignInBits, Metadata *Annotations, + StorageType Storage, bool ShouldCreate = true); + + TempDIGlobalVariable cloneImpl() const { + return getTemporary(getContext(), getScope(), getName(), getLinkageName(), + getFile(), getLine(), getType(), isLocalToUnit(), + isDefinition(), getStaticDataMemberDeclaration(), +- getTemplateParams(), getFlags(), getAlignInBits(), +- getAnnotations()); ++ getTemplateParams(), ++#ifdef ENABLE_CLASSIC_FLANG ++ getFlags(), ++#endif ++ getAlignInBits(), getAnnotations()); + } + + public: +@@ -3107,26 +3128,40 @@ public: + (DIScope * Scope, StringRef Name, StringRef LinkageName, DIFile *File, + unsigned Line, DIType *Type, bool IsLocalToUnit, bool IsDefinition, + DIDerivedType *StaticDataMemberDeclaration, MDTuple *TemplateParams, +- DIFlags Flags, uint32_t AlignInBits, DINodeArray Annotations), ++#ifdef ENABLE_CLASSIC_FLANG ++ DIFlags Flags, ++#endif ++ uint32_t AlignInBits, DINodeArray Annotations), + (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, 
TemplateParams, Flags, AlignInBits, +- Annotations)) ++ StaticDataMemberDeclaration, TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, Annotations)) + DEFINE_MDNODE_GET( + DIGlobalVariable, + (Metadata * Scope, MDString *Name, MDString *LinkageName, Metadata *File, + unsigned Line, Metadata *Type, bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams, +- DIFlags Flags, uint32_t AlignInBits, Metadata *Annotations), ++#ifdef ENABLE_CLASSIC_FLANG ++ DIFlags Flags, ++#endif ++ uint32_t AlignInBits, Metadata *Annotations), + (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, TemplateParams, Flags, AlignInBits, +- Annotations)) ++ StaticDataMemberDeclaration, TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, Annotations)) + + TempDIGlobalVariable clone() const { return cloneImpl(); } + + bool isLocalToUnit() const { return IsLocalToUnit; } + bool isDefinition() const { return IsDefinition; } ++#ifdef ENABLE_CLASSIC_FLANG + DIFlags getFlags() const { return Flags; } + bool isArtificial() const { return getFlags() & FlagArtificial; } ++#endif + StringRef getDisplayName() const { return getStringOperand(4); } + StringRef getLinkageName() const { return getStringOperand(5); } + DIDerivedType *getStaticDataMemberDeclaration() const { +diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp +index a6593f6b3757..a9d69af5373c 100644 +--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp ++++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp +@@ -871,14 +871,26 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, + TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary, T); + } + ++#ifdef ENABLE_CLASSIC_FLANG + TargetLibraryInfoImpl::TargetLibraryInfoImpl() : T(Triple()) { ++#else ++TargetLibraryInfoImpl::TargetLibraryInfoImpl() { ++#endif + // Default to everything being 
available. + memset(AvailableArray, -1, sizeof(AvailableArray)); + ++#ifdef ENABLE_CLASSIC_FLANG + initialize(*this, T, StandardNames); ++#else ++ initialize(*this, Triple(), StandardNames); ++#endif + } + ++#ifdef ENABLE_CLASSIC_FLANG + TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T) : T(T) { ++#else ++TargetLibraryInfoImpl::TargetLibraryInfoImpl(const Triple &T) { ++#endif + // Default to everything being available. + memset(AvailableArray, -1, sizeof(AvailableArray)); + +@@ -890,7 +902,11 @@ TargetLibraryInfoImpl::TargetLibraryInfoImpl(const TargetLibraryInfoImpl &TLI) + ShouldExtI32Return(TLI.ShouldExtI32Return), + ShouldSignExtI32Param(TLI.ShouldSignExtI32Param), + ShouldSignExtI32Return(TLI.ShouldSignExtI32Return), ++#ifdef ENABLE_CLASSIC_FLANG + SizeOfInt(TLI.SizeOfInt), T(TLI.T) { ++#else ++ SizeOfInt(TLI.SizeOfInt) { ++#endif + memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); + VectorDescs = TLI.VectorDescs; + ScalarDescs = TLI.ScalarDescs; +@@ -902,7 +918,11 @@ TargetLibraryInfoImpl::TargetLibraryInfoImpl(TargetLibraryInfoImpl &&TLI) + ShouldExtI32Return(TLI.ShouldExtI32Return), + ShouldSignExtI32Param(TLI.ShouldSignExtI32Param), + ShouldSignExtI32Return(TLI.ShouldSignExtI32Return), ++#ifdef ENABLE_CLASSIC_FLANG + SizeOfInt(TLI.SizeOfInt), T(TLI.T) { ++#else ++ SizeOfInt(TLI.SizeOfInt) { ++#endif + std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray), + AvailableArray); + VectorDescs = TLI.VectorDescs; +@@ -916,7 +936,9 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(const TargetLibraryInfoI + ShouldSignExtI32Param = TLI.ShouldSignExtI32Param; + ShouldSignExtI32Return = TLI.ShouldSignExtI32Return; + SizeOfInt = TLI.SizeOfInt; ++#ifdef ENABLE_CLASSIC_FLANG + T = TLI.T; ++#endif + memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); + return *this; + } +@@ -928,7 +950,9 @@ TargetLibraryInfoImpl &TargetLibraryInfoImpl::operator=(TargetLibraryInfoImpl && + ShouldSignExtI32Param = 
TLI.ShouldSignExtI32Param; + ShouldSignExtI32Return = TLI.ShouldSignExtI32Return; + SizeOfInt = TLI.SizeOfInt; ++#ifdef ENABLE_CLASSIC_FLANG + T = TLI.T; ++#endif + std::move(std::begin(TLI.AvailableArray), std::end(TLI.AvailableArray), + AvailableArray); + return *this; +diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp +index d7eb34e3d148..0a7166bd50b7 100644 +--- a/llvm/lib/AsmParser/LLParser.cpp ++++ b/llvm/lib/AsmParser/LLParser.cpp +@@ -5432,7 +5432,6 @@ bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { + OPTIONAL(isDefinition, MDBoolField, (true)); \ + OPTIONAL(templateParams, MDField, ); \ + OPTIONAL(declaration, MDField, ); \ +- OPTIONAL(flags, DIFlagField, ); \ + OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ + OPTIONAL(annotations, MDField, ); + #endif +@@ -5443,7 +5442,10 @@ bool LLParser::parseDIGlobalVariable(MDNode *&Result, bool IsDistinct) { + GET_OR_DISTINCT(DIGlobalVariable, + (Context, scope.Val, name.Val, linkageName.Val, file.Val, + line.Val, type.Val, isLocal.Val, isDefinition.Val, +- declaration.Val, templateParams.Val, flags.Val, ++ declaration.Val, templateParams.Val, ++#ifdef ENABLE_CLASSIC_FLANG ++ flags.Val, ++#endif + align.Val, annotations.Val)); + return false; + } +diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +index c21e5e5dba97..a33a0587d1c0 100644 +--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp ++++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +@@ -1979,12 +1979,16 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + break; + } + case bitc::METADATA_GLOBAL_VAR: { ++#ifdef ENABLE_CLASSIC_FLANG + if (Record.size() < 11 || Record.size() > 14) ++#else ++ if (Record.size() < 11 || Record.size() > 13) ++#endif + return error("Invalid record"); + + IsDistinct = Record[0] & 1; + unsigned Version = Record[0] >> 1; +- ++#ifdef ENABLE_CLASSIC_FLANG + if (Version == 3) { + // Add support for DIFlags + Metadata 
*Annotations = nullptr; +@@ -1998,24 +2002,30 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getDITypeRefOrNull(Record[6]), Record[7], Record[8], + getMDOrNull(Record[9]), getMDOrNull(Record[10]), +- static_cast(Record[11]), Record[12], +- Annotations)), ++ static_cast(Record[11]), ++ Record[12], Annotations)), + NextMetadataNo); + + NextMetadataNo++; + } else if (Version == 2) { ++#else ++ if (Version == 2) { ++#endif + Metadata *Annotations = nullptr; + if (Record.size() > 12) + Annotations = getMDOrNull(Record[12]); + + MetadataList.assignValue( +- GET_OR_DISTINCT( +- DIGlobalVariable, +- (Context, getMDOrNull(Record[1]), getMDString(Record[2]), +- getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], +- getDITypeRefOrNull(Record[6]), Record[7], Record[8], +- getMDOrNull(Record[9]), getMDOrNull(Record[10]), +- DINode::FlagZero, Record[11], Annotations)), ++ GET_OR_DISTINCT(DIGlobalVariable, ++ (Context, getMDOrNull(Record[1]), ++ getMDString(Record[2]), getMDString(Record[3]), ++ getMDOrNull(Record[4]), Record[5], ++ getDITypeRefOrNull(Record[6]), Record[7], Record[8], ++ getMDOrNull(Record[9]), getMDOrNull(Record[10]), ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::FlagZero, ++#endif ++ Record[11], Annotations)), + NextMetadataNo); + + NextMetadataNo++; +@@ -2028,8 +2038,11 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + (Context, getMDOrNull(Record[1]), getMDString(Record[2]), + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getDITypeRefOrNull(Record[6]), Record[7], Record[8], +- getMDOrNull(Record[10]), nullptr, DINode::FlagZero, Record[11], +- nullptr)), ++ getMDOrNull(Record[10]), nullptr, ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::FlagZero, ++#endif ++ Record[11], nullptr)), + NextMetadataNo); + + NextMetadataNo++; +@@ -2062,8 +2075,11 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( + (Context, getMDOrNull(Record[1]), 
getMDString(Record[2]), + getMDString(Record[3]), getMDOrNull(Record[4]), Record[5], + getDITypeRefOrNull(Record[6]), Record[7], Record[8], +- getMDOrNull(Record[10]), nullptr, DINode::FlagZero, AlignInBits, +- nullptr)); ++ getMDOrNull(Record[10]), nullptr, ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::FlagZero, ++#endif ++ AlignInBits, nullptr)); + + DIGlobalVariableExpression *DGVE = nullptr; + if (Attach || Expr) +diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +index 013e7ce2d425..d5bcd327a9b7 100644 +--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp ++++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +@@ -2014,7 +2014,11 @@ void ModuleBitcodeWriter::writeDITemplateValueParameter( + void ModuleBitcodeWriter::writeDIGlobalVariable( + const DIGlobalVariable *N, SmallVectorImpl &Record, + unsigned Abbrev) { ++#ifdef ENABLE_CLASSIC_FLANG + const uint64_t Version = 3 << 1; ++#else ++ const uint64_t Version = 2 << 1; ++#endif + Record.push_back((uint64_t)N->isDistinct() | Version); + Record.push_back(VE.getMetadataOrNullID(N->getScope())); + Record.push_back(VE.getMetadataOrNullID(N->getRawName())); +@@ -2026,7 +2030,9 @@ void ModuleBitcodeWriter::writeDIGlobalVariable( + Record.push_back(N->isDefinition()); + Record.push_back(VE.getMetadataOrNullID(N->getStaticDataMemberDeclaration())); + Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams())); ++#ifdef ENABLE_CLASSIC_FLANG + Record.push_back(N->getFlags()); ++#endif + Record.push_back(N->getAlignInBits()); + Record.push_back(VE.getMetadataOrNullID(N->getAnnotations().get())); + +diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +index ee8be3921ab7..b4fe2295d0b1 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h ++++ b/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h +@@ -115,6 +115,7 @@ class DbgValueLoc { + SmallVector ValueLocEntries; + + bool IsVariadic; ++#ifdef ENABLE_CLASSIC_FLANG + /// Type of entry that 
this represents. + enum EntryType { + E_Location, +@@ -138,6 +139,7 @@ class DbgValueLoc { + /// Or a location from target specific location. + TargetIndexLocation TIL; + }; ++#endif + + public: + DbgValueLoc(const DIExpression *Expr, ArrayRef Locs) +@@ -162,6 +164,7 @@ public: + assert(((Expr && Expr->isValid()) || !Loc.isLocation()) && + "DBG_VALUE with a machine location must have a valid expression."); + } ++#ifdef ENABLE_CLASSIC_FLANG + DbgValueLoc(const DIExpression *Expr, int64_t i) + : Expression(Expr), EntryKind(E_Integer) { + Constant.Int = i; +@@ -193,6 +196,7 @@ public: + const ConstantInt *getConstantInt() const { return Constant.CIP; } + MachineLocation getLoc() const { return Loc; } + TargetIndexLocation getTargetIndexLocation() const { return TIL; } ++#endif + + bool isFragment() const { return getExpression()->isFragment(); } + bool isEntryVal() const { return getExpression()->isEntryValue(); } +diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +index 78ff0d351492..4a70d1f07d6e 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp ++++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +@@ -184,10 +184,10 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE( + addFlag(*VariableDIE, dwarf::DW_AT_declaration); + else + addGlobalName(GV->getName(), *VariableDIE, DeclContext); +- ++#ifdef ENABLE_CLASSIC_FLANG + if (GV->isArtificial()) + addFlag(*VariableDIE, dwarf::DW_AT_artificial); +- ++#endif + addAnnotation(*VariableDIE, GV->getAnnotations()); + + if (uint32_t AlignInBytes = GV->getAlignInBytes()) +diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +index e526614792c7..1f7fe5c382e9 100644 +--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h ++++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +@@ -283,9 +283,9 @@ struct SymbolCU { + const MCSymbol *Sym; + DwarfCompileUnit *CU; + }; +- ++#ifdef ENABLE_CLASSIC_FLANG + class 
DummyDwarfExpression; +- ++#endif + /// The kind of accelerator tables we should emit. + enum class AccelTableKind { + Default, ///< Platform default. +@@ -438,9 +438,9 @@ private: + + /// Map for tracking Fortran deferred CHARACTER lengths. + DenseMap StringTypeLocMap; +- ++#ifdef ENABLE_CLASSIC_FLANG + DenseMap VariableInDependentType; +- ++#endif + AddressPool AddrPool; + + /// Accelerator tables. +diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp +index c47dd4664ea6..df753b91ff90 100644 +--- a/llvm/lib/IR/AsmWriter.cpp ++++ b/llvm/lib/IR/AsmWriter.cpp +@@ -2281,7 +2281,9 @@ static void writeDIGlobalVariable(raw_ostream &Out, const DIGlobalVariable *N, + Printer.printBool("isDefinition", N->isDefinition()); + Printer.printMetadata("declaration", N->getRawStaticDataMemberDeclaration()); + Printer.printMetadata("templateParams", N->getRawTemplateParams()); ++#ifdef ENABLE_CLASSIC_FLANG + Printer.printDIFlags("flags", N->getFlags()); ++#endif + Printer.printInt("align", N->getAlignInBits()); + Printer.printMetadata("annotations", N->getRawAnnotations()); + Out << ")"; +diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp +index af6ebf702165..41b2acd8661f 100644 +--- a/llvm/lib/IR/DIBuilder.cpp ++++ b/llvm/lib/IR/DIBuilder.cpp +@@ -725,13 +725,19 @@ DIGlobalVariableExpression *DIBuilder::createGlobalVariableExpression( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, + unsigned LineNumber, DIType *Ty, bool IsLocalToUnit, bool isDefined, + DIExpression *Expr, MDNode *Decl, MDTuple *TemplateParams, +- DINode::DIFlags Flags, uint32_t AlignInBits, DINodeArray Annotations) { ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::DIFlags Flags, ++#endif ++ uint32_t AlignInBits, DINodeArray Annotations) { + checkGlobalVariableScope(Context); + + auto *GV = DIGlobalVariable::getDistinct( + VMContext, cast_or_null(Context), Name, LinkageName, F, + LineNumber, Ty, IsLocalToUnit, isDefined, +- cast_or_null(Decl), TemplateParams, Flags, ++ 
cast_or_null(Decl), TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif + AlignInBits, Annotations); + if (!Expr) + Expr = createExpression(); +@@ -743,13 +749,20 @@ DIGlobalVariableExpression *DIBuilder::createGlobalVariableExpression( + DIGlobalVariable *DIBuilder::createTempGlobalVariableFwdDecl( + DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F, + unsigned LineNumber, DIType *Ty, bool IsLocalToUnit, MDNode *Decl, +- MDTuple *TemplateParams, DINode::DIFlags Flags, uint32_t AlignInBits) { ++ MDTuple *TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ DINode::DIFlags Flags, ++#endif ++ uint32_t AlignInBits) { + checkGlobalVariableScope(Context); + + return DIGlobalVariable::getTemporary( + VMContext, cast_or_null(Context), Name, LinkageName, F, + LineNumber, Ty, IsLocalToUnit, false, +- cast_or_null(Decl), TemplateParams, Flags, ++ cast_or_null(Decl), TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif + AlignInBits, nullptr) + .release(); + } +diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp +index 3696beccdd0c..f21a8f6e3c10 100644 +--- a/llvm/lib/IR/DebugInfo.cpp ++++ b/llvm/lib/IR/DebugInfo.cpp +@@ -1547,13 +1547,20 @@ LLVMMetadataRef LLVMDIBuilderCreateGlobalVariableExpression( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LinkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Expr, LLVMMetadataRef Decl, LLVMDIFlags Flags, ++ LLVMMetadataRef Expr, LLVMMetadataRef Decl, ++#ifdef ENABLE_CLASSIC_FLANG ++ LLVMDIFlags Flags, ++#endif + uint32_t AlignInBits) { + return wrap(unwrap(Builder)->createGlobalVariableExpression( + unwrapDI(Scope), {Name, NameLen}, {Linkage, LinkLen}, + unwrapDI(File), LineNo, unwrapDI(Ty), LocalToUnit, + true, unwrap(Expr), unwrapDI(Decl), +- nullptr, map_from_llvmDIFlags(Flags), AlignInBits)); ++ nullptr, ++#ifdef ENABLE_CLASSIC_FLANG ++ 
map_from_llvmDIFlags(Flags), ++#endif ++ AlignInBits)); + } + + LLVMMetadataRef LLVMDIGlobalVariableExpressionGetVariable(LLVMMetadataRef GVE) { +@@ -1598,11 +1605,18 @@ LLVMMetadataRef LLVMDIBuilderCreateTempGlobalVariableFwdDecl( + LLVMDIBuilderRef Builder, LLVMMetadataRef Scope, const char *Name, + size_t NameLen, const char *Linkage, size_t LnkLen, LLVMMetadataRef File, + unsigned LineNo, LLVMMetadataRef Ty, LLVMBool LocalToUnit, +- LLVMMetadataRef Decl, LLVMDIFlags Flags, uint32_t AlignInBits) { ++ LLVMMetadataRef Decl, ++#ifdef ENABLE_CLASSIC_FLANG ++ LLVMDIFlags Flags, ++#endif ++ uint32_t AlignInBits) { + return wrap(unwrap(Builder)->createTempGlobalVariableFwdDecl( + unwrapDI(Scope), {Name, NameLen}, {Linkage, LnkLen}, + unwrapDI(File), LineNo, unwrapDI(Ty), LocalToUnit, +- unwrapDI(Decl), nullptr, map_from_llvmDIFlags(Flags), ++ unwrapDI(Decl), nullptr, ++#ifdef ENABLE_CLASSIC_FLANG ++ map_from_llvmDIFlags(Flags), ++#endif + AlignInBits)); + } + +diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp +index d599896ee456..074529f6e1c6 100644 +--- a/llvm/lib/IR/DebugInfoMetadata.cpp ++++ b/llvm/lib/IR/DebugInfoMetadata.cpp +@@ -1258,7 +1258,10 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, + MDString *LinkageName, Metadata *File, unsigned Line, + Metadata *Type, bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, +- Metadata *TemplateParams, DIFlags Flags, ++ Metadata *TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ DIFlags Flags, ++#endif + uint32_t AlignInBits, Metadata *Annotations, + StorageType Storage, bool ShouldCreate) { + assert(isCanonical(Name) && "Expected canonical MDString"); +@@ -1266,8 +1269,11 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, + DEFINE_GETIMPL_LOOKUP( + DIGlobalVariable, + (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, TemplateParams, Flags, 
AlignInBits, +- Annotations)); ++ StaticDataMemberDeclaration, TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, Annotations)); + Metadata *Ops[] = {Scope, + Name, + File, +@@ -1278,8 +1284,11 @@ DIGlobalVariable::getImpl(LLVMContext &Context, Metadata *Scope, MDString *Name, + TemplateParams, + Annotations}; + DEFINE_GETIMPL_STORE(DIGlobalVariable, +- (Line, IsLocalToUnit, IsDefinition, Flags, AlignInBits), +- Ops); ++ (Line, IsLocalToUnit, IsDefinition, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits), Ops); + } + + DILocalVariable * +diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h +index 8a621725f55e..01faec286b7d 100644 +--- a/llvm/lib/IR/LLVMContextImpl.h ++++ b/llvm/lib/IR/LLVMContextImpl.h +@@ -1055,7 +1055,9 @@ template <> struct MDNodeKeyImpl { + bool IsDefinition; + Metadata *StaticDataMemberDeclaration; + Metadata *TemplateParams; ++#ifdef ENABLE_CLASSIC_FLANG + unsigned Flags; ++#endif + uint32_t AlignInBits; + Metadata *Annotations; + +@@ -1063,13 +1065,18 @@ template <> struct MDNodeKeyImpl { + Metadata *File, unsigned Line, Metadata *Type, + bool IsLocalToUnit, bool IsDefinition, + Metadata *StaticDataMemberDeclaration, Metadata *TemplateParams, ++#ifdef ENABLE_CLASSIC_FLANG + unsigned Flags, ++#endif + uint32_t AlignInBits, Metadata *Annotations) + : Scope(Scope), Name(Name), LinkageName(LinkageName), File(File), + Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit), + IsDefinition(IsDefinition), + StaticDataMemberDeclaration(StaticDataMemberDeclaration), +- TemplateParams(TemplateParams), Flags(Flags), ++ TemplateParams(TemplateParams), ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags(Flags), ++#endif + AlignInBits(AlignInBits), Annotations(Annotations) {} + MDNodeKeyImpl(const DIGlobalVariable *N) + : Scope(N->getRawScope()), Name(N->getRawName()), +@@ -1077,7 +1084,10 @@ template <> struct MDNodeKeyImpl { + Line(N->getLine()), Type(N->getRawType()), + 
IsLocalToUnit(N->isLocalToUnit()), IsDefinition(N->isDefinition()), + StaticDataMemberDeclaration(N->getRawStaticDataMemberDeclaration()), +- TemplateParams(N->getRawTemplateParams()), Flags(N->getFlags()), ++ TemplateParams(N->getRawTemplateParams()), ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags(N->getFlags()), ++#endif + AlignInBits(N->getAlignInBits()), Annotations(N->getRawAnnotations()) {} + + bool isKeyOf(const DIGlobalVariable *RHS) const { +@@ -1089,7 +1099,9 @@ template <> struct MDNodeKeyImpl { + StaticDataMemberDeclaration == + RHS->getRawStaticDataMemberDeclaration() && + TemplateParams == RHS->getRawTemplateParams() && ++#ifdef ENABLE_CLASSIC_FLANG + Flags == RHS->getFlags() && ++#endif + AlignInBits == RHS->getAlignInBits() && + Annotations == RHS->getRawAnnotations(); + } +@@ -1104,7 +1116,11 @@ template <> struct MDNodeKeyImpl { + // TODO: make hashing work fine with such situations + return hash_combine(Scope, Name, LinkageName, File, Line, Type, + IsLocalToUnit, IsDefinition, /* AlignInBits, */ +- StaticDataMemberDeclaration, Flags, Annotations); ++ StaticDataMemberDeclaration, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ Annotations); + } + }; + +diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +index 136132d7e65a..343554241da3 100644 +--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp ++++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +@@ -1053,7 +1053,10 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) { + /*LineNo=*/0, DB.createUnspecifiedType("Profile Data Type"), + CounterPtr->hasLocalLinkage(), /*IsDefined=*/true, /*Expr=*/nullptr, + /*Decl=*/nullptr, /*TemplateParams=*/nullptr, +- /*Flags=*/DINode::FlagZero, /*AlignInBits=*/0, Annotations); ++#ifdef ENABLE_CLASSIC_FLANG ++ /*Flags=*/DINode::FlagZero, ++#endif ++ /*AlignInBits=*/0, Annotations); + CounterPtr->addDebugInfo(DICounter); + DB.finalize(); + } else { +diff --git 
a/llvm/tools/llvm-c-test/debuginfo.c b/llvm/tools/llvm-c-test/debuginfo.c +index 906c96f1c24b..e1866443e762 100644 +--- a/llvm/tools/llvm-c-test/debuginfo.c ++++ b/llvm/tools/llvm-c-test/debuginfo.c +@@ -64,7 +64,11 @@ int llvm_test_dibuilder(void) { + LLVMDIBuilderCreateConstantValueExpression(DIB, 0); + LLVMDIBuilderCreateGlobalVariableExpression( + DIB, Module, "globalClass", 11, "", 0, File, 1, ClassTy, true, +- GlobalClassValueExpr, NULL, LLVMDIFlagZero, 0); ++ GlobalClassValueExpr, NULL, ++#ifdef ENABLE_CLASSIC_FLANG ++ LLVMDIFlagZero, ++#endif ++ 0); + + LLVMMetadataRef Int64Ty = + LLVMDIBuilderCreateBasicType(DIB, "Int64", 5, 64, 0, LLVMDIFlagZero); +@@ -75,7 +79,11 @@ int llvm_test_dibuilder(void) { + LLVMDIBuilderCreateConstantValueExpression(DIB, 0); + LLVMDIBuilderCreateGlobalVariableExpression( + DIB, Module, "global", 6, "", 0, File, 1, Int64TypeDef, true, +- GlobalVarValueExpr, NULL, LLVMDIFlagZero, 0); ++ GlobalVarValueExpr, NULL, ++#ifdef ENABLE_CLASSIC_FLANG ++ LLVMDIFlagZero, ++#endif ++ 0); + + LLVMMetadataRef NameSpace = + LLVMDIBuilderCreateNameSpace(DIB, Module, "NameSpace", 9, false); +diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp +index 4bce26851d2f..788d514ad366 100644 +--- a/llvm/unittests/IR/MetadataTest.cpp ++++ b/llvm/unittests/IR/MetadataTest.cpp +@@ -2895,13 +2895,17 @@ TEST_F(DIGlobalVariableTest, get) { + MDTuple *templateParams = getTuple(); + DIDerivedType *StaticDataMemberDeclaration = + cast(getDerivedType()); +- ++#ifdef ENABLE_CLASSIC_FLANG + DINode::DIFlags Flags = static_cast(7); ++#endif + uint32_t AlignInBits = 8; + + auto *N = DIGlobalVariable::get( + Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, +- IsDefinition, StaticDataMemberDeclaration, templateParams, Flags, ++ IsDefinition, StaticDataMemberDeclaration, templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif + AlignInBits, nullptr); + + EXPECT_EQ(dwarf::DW_TAG_variable, N->getTag()); +@@ -2915,67 
+2919,114 @@ TEST_F(DIGlobalVariableTest, get) { + EXPECT_EQ(IsDefinition, N->isDefinition()); + EXPECT_EQ(StaticDataMemberDeclaration, N->getStaticDataMemberDeclaration()); + EXPECT_EQ(templateParams, N->getTemplateParams()); ++#ifdef ENABLE_CLASSIC_FLANG + EXPECT_EQ(Flags, N->getFlags()); ++#endif + EXPECT_EQ(AlignInBits, N->getAlignInBits()); + EXPECT_EQ(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, +- nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + + EXPECT_NE(N, DIGlobalVariable::get( + Context, getSubprogram(), Name, LinkageName, File, Line, + Type, IsLocalToUnit, IsDefinition, +- StaticDataMemberDeclaration, templateParams, Flags, ++ StaticDataMemberDeclaration, templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif + AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, "other", LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, +- nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, "other", File, Line, + Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, + getFile(), Line, Type, IsLocalToUnit, + IsDefinition, StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line + 1, Type, IsLocalToUnit, + IsDefinition, 
StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, getDerivedType(), IsLocalToUnit, + IsDefinition, StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, !IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, !IsDefinition, + StaticDataMemberDeclaration, +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + cast(getDerivedType()), +- templateParams, Flags, AlignInBits, nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, nullptr, +- Flags, AlignInBits, nullptr)); ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ AlignInBits, nullptr)); ++#ifdef ENABLE_CLASSIC_FLANG + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, + templateParams, + static_cast(Flags + 1), + AlignInBits, nullptr)); ++#endif + EXPECT_NE(N, DIGlobalVariable::get(Context, Scope, Name, LinkageName, File, + Line, Type, 
IsLocalToUnit, IsDefinition, + StaticDataMemberDeclaration, +- templateParams, Flags, (AlignInBits << 1), +- nullptr)); ++ templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif ++ (AlignInBits << 1), nullptr)); + + TempDIGlobalVariable Temp = N->clone(); + EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp))); +@@ -2997,16 +3048,24 @@ TEST_F(DIGlobalVariableExpressionTest, get) { + auto *Expr2 = DIExpression::get(Context, {1, 2, 3}); + DIDerivedType *StaticDataMemberDeclaration = + cast(getDerivedType()); ++#ifdef ENABLE_CLASSIC_FLANG + DINode::DIFlags Flags = static_cast(7); ++#endif + uint32_t AlignInBits = 8; + + auto *Var = DIGlobalVariable::get( + Context, Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, +- IsDefinition, StaticDataMemberDeclaration, templateParams, Flags, ++ IsDefinition, StaticDataMemberDeclaration, templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif + AlignInBits, nullptr); + auto *Var2 = DIGlobalVariable::get( + Context, Scope, "other", LinkageName, File, Line, Type, IsLocalToUnit, +- IsDefinition, StaticDataMemberDeclaration, templateParams, Flags, ++ IsDefinition, StaticDataMemberDeclaration, templateParams, ++#ifdef ENABLE_CLASSIC_FLANG ++ Flags, ++#endif + AlignInBits, nullptr); + auto *N = DIGlobalVariableExpression::get(Context, Var, Expr); + +diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py +index d8dec6160071..69ede49dadc5 100644 +--- a/llvm/utils/lit/lit/llvm/config.py ++++ b/llvm/utils/lit/lit/llvm/config.py +@@ -632,14 +632,15 @@ class LLVMConfig(object): + ] + self.add_tool_substitutions(tool_substitutions) + self.config.substitutions.append(("%resource_dir", builtin_include_dir)) +- +- self.config.flang = self.use_llvm_tool( +- 'flang', search_env='FLANG', required=required) +- if self.config.flang: +- tool_substitutions = [ +- ToolSubst('%flang', command=self.config.flang) +- ] +- self.add_tool_substitutions(tool_substitutions) ++ use_classic_flang = 
getattr(self.config, "use_classic_flang", None) ++ if use_classic_flang and use_classic_flang != "@LLVM_ENABLE_CLASSIC_FLANG@": ++ self.config.flang = self.use_llvm_tool( ++ 'flang', search_env='FLANG', required=required) ++ if self.config.flang: ++ tool_substitutions = [ ++ ToolSubst('%flang', command=self.config.flang) ++ ] ++ self.add_tool_substitutions(tool_substitutions) + + self.config.substitutions.append( + ( +-- +2.24.3 (Apple Git-128) + diff --git a/0019-Backport-LoongArch-Improve-the-support-for-atomic-and-clear_cache.patch b/0019-Backport-LoongArch-Improve-the-support-for-atomic-and-clear_cache.patch new file mode 100644 index 0000000..98f2654 --- /dev/null +++ b/0019-Backport-LoongArch-Improve-the-support-for-atomic-and-clear_cache.patch @@ -0,0 +1,12426 @@ +From ad367d826e5959792ce7384be62ba1ccffbf0d9a Mon Sep 17 00:00:00 2001 +From: hev +Date: Wed, 11 Oct 2023 10:24:18 +0800 +Subject: [PATCH 1/7] [LoongArch] Improve codegen for atomic ops (#67391) + +This PR improves memory barriers generated by atomic operations. + +Memory barrier semantics of LL/SC: +``` +LL: + +SC: + +``` + +Changes: +* Remove unnecessary memory barriers before LL and between LL/SC. +* Fix acquire semantics. (If the SC instruction is not executed, then +the guarantee of acquiring semantics cannot be ensured. Therefore, an +acquire barrier needs to be generated when memory ordering includes an +acquire operation.) 
+ +(cherry picked from commit 203ba238e33c570dba6cbcf247f1668bb2a13c26) +--- + .../LoongArchExpandAtomicPseudoInsts.cpp | 50 +-- + .../Target/LoongArch/LoongArchInstrInfo.td | 24 +- + .../LoongArch/atomicrmw-uinc-udec-wrap.ll | 24 +- + .../ir-instruction/atomic-cmpxchg.ll | 376 ++++++++++++++++-- + .../LoongArch/ir-instruction/atomicrmw-fp.ll | 24 +- + .../ir-instruction/atomicrmw-minmax.ll | 24 -- + .../LoongArch/ir-instruction/atomicrmw.ll | 31 -- + llvm/unittests/Target/LoongArch/InstSizes.cpp | 2 +- + 8 files changed, 407 insertions(+), 148 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +index 51df0463e235..eb78ef065b21 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +@@ -153,18 +153,12 @@ static void doAtomicBinOpExpansion(const LoongArchInstrInfo *TII, + Register ScratchReg = MI.getOperand(1).getReg(); + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); +- AtomicOrdering Ordering = +- static_cast(MI.getOperand(4).getImm()); + + // .loop: +- // if(Ordering != AtomicOrdering::Monotonic) +- // dbar 0 + // ll.[w|d] dest, (addr) + // binop scratch, dest, val + // sc.[w|d] scratch, scratch, (addr) + // beqz scratch, loop +- if (Ordering != AtomicOrdering::Monotonic) +- BuildMI(LoopMBB, DL, TII->get(LoongArch::DBAR)).addImm(0); + BuildMI(LoopMBB, DL, + TII->get(Width == 32 ? 
LoongArch::LL_W : LoongArch::LL_D), DestReg) + .addReg(AddrReg) +@@ -251,12 +245,8 @@ static void doMaskedAtomicBinOpExpansion( + Register AddrReg = MI.getOperand(2).getReg(); + Register IncrReg = MI.getOperand(3).getReg(); + Register MaskReg = MI.getOperand(4).getReg(); +- AtomicOrdering Ordering = +- static_cast(MI.getOperand(5).getImm()); + + // .loop: +- // if(Ordering != AtomicOrdering::Monotonic) +- // dbar 0 + // ll.w destreg, (alignedaddr) + // binop scratch, destreg, incr + // xor scratch, destreg, scratch +@@ -264,8 +254,6 @@ static void doMaskedAtomicBinOpExpansion( + // xor scratch, destreg, scratch + // sc.w scratch, scratch, (alignedaddr) + // beqz scratch, loop +- if (Ordering != AtomicOrdering::Monotonic) +- BuildMI(LoopMBB, DL, TII->get(LoongArch::DBAR)).addImm(0); + BuildMI(LoopMBB, DL, TII->get(LoongArch::LL_W), DestReg) + .addReg(AddrReg) + .addImm(0); +@@ -372,23 +360,20 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp( + auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); +- auto TailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + // Insert new MBBs. + MF->insert(++MBB.getIterator(), LoopHeadMBB); + MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB); + MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB); +- MF->insert(++LoopTailMBB->getIterator(), TailMBB); +- MF->insert(++TailMBB->getIterator(), DoneMBB); ++ MF->insert(++LoopTailMBB->getIterator(), DoneMBB); + + // Set up successors and transfer remaining instructions to DoneMBB. 
+ LoopHeadMBB->addSuccessor(LoopIfBodyMBB); + LoopHeadMBB->addSuccessor(LoopTailMBB); + LoopIfBodyMBB->addSuccessor(LoopTailMBB); + LoopTailMBB->addSuccessor(LoopHeadMBB); +- LoopTailMBB->addSuccessor(TailMBB); +- TailMBB->addSuccessor(DoneMBB); ++ LoopTailMBB->addSuccessor(DoneMBB); + DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end()); + DoneMBB->transferSuccessors(&MBB); + MBB.addSuccessor(LoopHeadMBB); +@@ -402,11 +387,9 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp( + + // + // .loophead: +- // dbar 0 + // ll.w destreg, (alignedaddr) + // and scratch2, destreg, mask + // move scratch1, destreg +- BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::DBAR)).addImm(0); + BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::LL_W), DestReg) + .addReg(AddrReg) + .addImm(0); +@@ -463,7 +446,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp( + // .looptail: + // sc.w scratch1, scratch1, (addr) + // beqz scratch1, loop +- // dbar 0x700 + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::SC_W), Scratch1Reg) + .addReg(Scratch1Reg) + .addReg(AddrReg) +@@ -472,10 +454,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp( + .addReg(Scratch1Reg) + .addMBB(LoopHeadMBB); + +- // .tail: +- // dbar 0x700 +- BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0x700); +- + NextMBBI = MBB.end(); + MI.eraseFromParent(); + +@@ -483,7 +461,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp( + computeAndAddLiveIns(LiveRegs, *LoopHeadMBB); + computeAndAddLiveIns(LiveRegs, *LoopIfBodyMBB); + computeAndAddLiveIns(LiveRegs, *LoopTailMBB); +- computeAndAddLiveIns(LiveRegs, *TailMBB); + computeAndAddLiveIns(LiveRegs, *DoneMBB); + + return true; +@@ -535,12 +512,10 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg( + .addReg(CmpValReg) + .addMBB(TailMBB); + // .looptail: +- // dbar 0 + // move scratch, newval + // sc.[w|d] scratch, scratch, (addr) + // beqz scratch, loophead + // b done +- BuildMI(LoopTailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0); + 
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::OR), ScratchReg) + .addReg(NewValReg) + .addReg(LoongArch::R0); +@@ -573,13 +548,11 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg( + .addMBB(TailMBB); + + // .looptail: +- // dbar 0 + // andn scratch, dest, mask + // or scratch, scratch, newval + // sc.[w|d] scratch, scratch, (addr) + // beqz scratch, loophead + // b done +- BuildMI(LoopTailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0); + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::ANDN), ScratchReg) + .addReg(DestReg) + .addReg(MaskReg); +@@ -598,9 +571,24 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg( + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::B)).addMBB(DoneMBB); + } + ++ AtomicOrdering Ordering = ++ static_cast(MI.getOperand(IsMasked ? 6 : 5).getImm()); ++ int hint; ++ ++ switch (Ordering) { ++ case AtomicOrdering::Acquire: ++ case AtomicOrdering::AcquireRelease: ++ case AtomicOrdering::SequentiallyConsistent: ++ // TODO: acquire ++ hint = 0; ++ break; ++ default: ++ hint = 0x700; ++ } ++ + // .tail: +- // dbar 0x700 +- BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0x700); ++ // dbar 0x700 | acquire ++ BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(hint); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index 05ae36a9781d..a9b0db30c2f6 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -1731,7 +1731,7 @@ def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMMinMax; + + class PseudoCmpXchg + : Pseudo<(outs GPR:$res, GPR:$scratch), +- (ins GPR:$addr, GPR:$cmpval, GPR:$newval)> { ++ (ins GPR:$addr, GPR:$cmpval, GPR:$newval, grlenimm:$ordering)> { + let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; + let mayLoad = 1; + let mayStore = 1; +@@ -1821,14 +1821,28 @@ def : AtomicPat; + +-def : Pat<(atomic_cmp_swap_64 GPR:$addr, GPR:$cmp, 
GPR:$new), +- (PseudoCmpXchg64 GPR:$addr, GPR:$cmp, GPR:$new)>; ++// Ordering constants must be kept in sync with the AtomicOrdering enum in ++// AtomicOrdering.h. ++multiclass PseudoCmpXchgPat { ++ def : Pat<(vt (!cast(Op#"_monotonic") GPR:$addr, GPR:$cmp, GPR:$new)), ++ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 2)>; ++ def : Pat<(vt (!cast(Op#"_acquire") GPR:$addr, GPR:$cmp, GPR:$new)), ++ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 4)>; ++ def : Pat<(vt (!cast(Op#"_release") GPR:$addr, GPR:$cmp, GPR:$new)), ++ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 5)>; ++ def : Pat<(vt (!cast(Op#"_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new)), ++ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 6)>; ++ def : Pat<(vt (!cast(Op#"_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new)), ++ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; ++} ++ ++defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>; ++defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>; + def : Pat<(int_loongarch_masked_cmpxchg_i64 + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering), + (PseudoMaskedCmpXchg32 + GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>; +-def : Pat<(atomic_cmp_swap_32 GPR:$addr, GPR:$cmp, GPR:$new), +- (PseudoCmpXchg32 GPR:$addr, GPR:$cmp, GPR:$new)>; + + def : PseudoMaskedAMMinMaxPat; +diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +index f11af8fe6528..32106886c783 100644 +--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll ++++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +@@ -34,14 +34,13 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: bne $a5, $a3, .LBB0_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a7, $a6 + ; LA64-NEXT: sc.w $a7, $a2, 0 + ; LA64-NEXT: beqz $a7, .LBB0_3 + ; LA64-NEXT: b .LBB0_6 + ; LA64-NEXT: .LBB0_5: # 
%atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64-NEXT: addi.w $a6, $a3, 0 +@@ -88,14 +87,13 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: bne $a5, $a3, .LBB1_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a7, $a6 + ; LA64-NEXT: sc.w $a7, $a2, 0 + ; LA64-NEXT: beqz $a7, .LBB1_3 + ; LA64-NEXT: b .LBB1_6 + ; LA64-NEXT: .LBB1_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64-NEXT: addi.w $a6, $a3, 0 +@@ -129,14 +127,13 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-NEXT: bne $a1, $a3, .LBB2_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a6, $a5 + ; LA64-NEXT: sc.w $a6, $a0, 0 + ; LA64-NEXT: beqz $a6, .LBB2_3 + ; LA64-NEXT: b .LBB2_6 + ; LA64-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64-NEXT: move $a3, $a1 +@@ -168,14 +165,13 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { + ; LA64-NEXT: bne $a2, $a3, .LBB3_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB3_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a5, $a4 + ; LA64-NEXT: sc.d $a5, $a0, 0 + ; LA64-NEXT: beqz $a5, .LBB3_3 + ; LA64-NEXT: b .LBB3_6 + ; LA64-NEXT: .LBB3_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64-NEXT: # in 
Loop: Header=BB3_1 Depth=1 + ; LA64-NEXT: bne $a2, $a3, .LBB3_1 +@@ -224,14 +220,13 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: bne $a6, $a3, .LBB4_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $t0, $a7 + ; LA64-NEXT: sc.w $t0, $a2, 0 + ; LA64-NEXT: beqz $t0, .LBB4_3 + ; LA64-NEXT: b .LBB4_6 + ; LA64-NEXT: .LBB4_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB4_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 + ; LA64-NEXT: addi.w $a7, $a3, 0 +@@ -283,14 +278,13 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: bne $a6, $a3, .LBB5_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $t0, $a7 + ; LA64-NEXT: sc.w $t0, $a2, 0 + ; LA64-NEXT: beqz $t0, .LBB5_3 + ; LA64-NEXT: b .LBB5_6 + ; LA64-NEXT: .LBB5_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB5_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 + ; LA64-NEXT: addi.w $a7, $a3, 0 +@@ -329,14 +323,13 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-NEXT: bne $a2, $a4, .LBB6_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a7, $a6 + ; LA64-NEXT: sc.w $a7, $a0, 0 + ; LA64-NEXT: beqz $a7, .LBB6_3 + ; LA64-NEXT: b .LBB6_6 + ; LA64-NEXT: .LBB6_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB6_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 +@@ -373,14 +366,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { + ; LA64-NEXT: bne $a2, $a3, 
.LBB7_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB7_3 Depth=2 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a5, $a4 + ; LA64-NEXT: sc.d $a5, $a0, 0 + ; LA64-NEXT: beqz $a5, .LBB7_3 + ; LA64-NEXT: b .LBB7_6 + ; LA64-NEXT: .LBB7_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1 +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB7_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1 + ; LA64-NEXT: bne $a2, $a3, .LBB7_1 +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +index 76e51fe7d3e8..1ac20d10e587 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +@@ -21,14 +21,13 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { + ; LA64-NEXT: and $a5, $a4, $a0 + ; LA64-NEXT: bne $a5, $a1, .LBB0_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: andn $a5, $a4, $a0 + ; LA64-NEXT: or $a5, $a5, $a2 + ; LA64-NEXT: sc.w $a5, $a3, 0 + ; LA64-NEXT: beqz $a5, .LBB0_1 + ; LA64-NEXT: b .LBB0_4 + ; LA64-NEXT: .LBB0_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB0_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire acquire +@@ -56,14 +55,13 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind + ; LA64-NEXT: and $a5, $a4, $a0 + ; LA64-NEXT: bne $a5, $a1, .LBB1_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: andn $a5, $a4, $a0 + ; LA64-NEXT: or $a5, $a5, $a2 + ; LA64-NEXT: sc.w $a5, $a3, 0 + ; LA64-NEXT: beqz $a5, .LBB1_1 + ; LA64-NEXT: b .LBB1_4 + ; LA64-NEXT: .LBB1_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB1_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire acquire +@@ -77,13 +75,12 @@ 
define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB2_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 + ; LA64-NEXT: beqz $a4, .LBB2_1 + ; LA64-NEXT: b .LBB2_4 + ; LA64-NEXT: .LBB2_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB2_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire +@@ -97,13 +94,12 @@ define void @cmpxchg_i64_acquire_acquire(ptr %ptr, i64 %cmp, i64 %val) nounwind + ; LA64-NEXT: ll.d $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB3_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 + ; LA64-NEXT: beqz $a4, .LBB3_1 + ; LA64-NEXT: b .LBB3_4 + ; LA64-NEXT: .LBB3_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB3_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire acquire +@@ -130,14 +126,13 @@ define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: and $a6, $a5, $a4 + ; LA64-NEXT: bne $a6, $a1, .LBB4_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: andn $a6, $a5, $a4 + ; LA64-NEXT: or $a6, $a6, $a2 + ; LA64-NEXT: sc.w $a6, $a3, 0 + ; LA64-NEXT: beqz $a6, .LBB4_1 + ; LA64-NEXT: b .LBB4_4 + ; LA64-NEXT: .LBB4_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB4_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret +@@ -167,14 +162,13 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou + ; LA64-NEXT: and $a6, $a5, $a4 + ; LA64-NEXT: bne $a6, $a1, .LBB5_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: andn $a6, $a5, $a4 + ; LA64-NEXT: or $a6, $a6, $a2 + ; LA64-NEXT: sc.w $a6, $a3, 0 + ; LA64-NEXT: beqz $a6, 
.LBB5_1 + ; LA64-NEXT: b .LBB5_4 + ; LA64-NEXT: .LBB5_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB5_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret +@@ -190,13 +184,12 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB6_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 + ; LA64-NEXT: beqz $a4, .LBB6_1 + ; LA64-NEXT: b .LBB6_4 + ; LA64-NEXT: .LBB6_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB6_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret +@@ -212,13 +205,12 @@ define i64 @cmpxchg_i64_acquire_acquire_reti64(ptr %ptr, i64 %cmp, i64 %val) nou + ; LA64-NEXT: ll.d $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB7_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 + ; LA64-NEXT: beqz $a4, .LBB7_1 + ; LA64-NEXT: b .LBB7_4 + ; LA64-NEXT: .LBB7_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB7_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret +@@ -247,14 +239,13 @@ define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: and $a6, $a5, $a2 + ; LA64-NEXT: bne $a6, $a1, .LBB8_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB8_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: andn $a6, $a5, $a2 + ; LA64-NEXT: or $a6, $a6, $a0 + ; LA64-NEXT: sc.w $a6, $a3, 0 + ; LA64-NEXT: beqz $a6, .LBB8_1 + ; LA64-NEXT: b .LBB8_4 + ; LA64-NEXT: .LBB8_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB8_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 +@@ -287,14 +278,13 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw + ; LA64-NEXT: and $a6, $a5, $a2 + ; LA64-NEXT: bne $a6, $a1, .LBB9_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB9_1 
Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: andn $a6, $a5, $a2 + ; LA64-NEXT: or $a6, $a6, $a0 + ; LA64-NEXT: sc.w $a6, $a3, 0 + ; LA64-NEXT: beqz $a6, .LBB9_1 + ; LA64-NEXT: b .LBB9_4 + ; LA64-NEXT: .LBB9_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB9_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 +@@ -313,13 +303,12 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB10_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 + ; LA64-NEXT: beqz $a4, .LBB10_1 + ; LA64-NEXT: b .LBB10_4 + ; LA64-NEXT: .LBB10_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB10_4: + ; LA64-NEXT: addi.w $a0, $a1, 0 + ; LA64-NEXT: xor $a0, $a3, $a0 +@@ -337,13 +326,12 @@ define i1 @cmpxchg_i64_acquire_acquire_reti1(ptr %ptr, i64 %cmp, i64 %val) nounw + ; LA64-NEXT: ll.d $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB11_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 + ; LA64-NEXT: beqz $a4, .LBB11_1 + ; LA64-NEXT: b .LBB11_4 + ; LA64-NEXT: .LBB11_3: +-; LA64-NEXT: dbar 1792 ++; LA64-NEXT: dbar 0 + ; LA64-NEXT: .LBB11_4: + ; LA64-NEXT: xor $a0, $a3, $a1 + ; LA64-NEXT: sltui $a0, $a0, 1 +@@ -352,3 +340,343 @@ define i1 @cmpxchg_i64_acquire_acquire_reti1(ptr %ptr, i64 %cmp, i64 %val) nounw + %res = extractvalue { i64, i1 } %tmp, 1 + ret i1 %res + } ++ ++define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ++; LA64-LABEL: cmpxchg_i8_monotonic_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: andi $a2, $a2, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: 
ori $a4, $zero, 255 ++; LA64-NEXT: sll.w $a0, $a4, $a0 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a3, 0 ++; LA64-NEXT: and $a5, $a4, $a0 ++; LA64-NEXT: bne $a5, $a1, .LBB12_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB12_1 Depth=1 ++; LA64-NEXT: andn $a5, $a4, $a0 ++; LA64-NEXT: or $a5, $a5, $a2 ++; LA64-NEXT: sc.w $a5, $a3, 0 ++; LA64-NEXT: beqz $a5, .LBB12_1 ++; LA64-NEXT: b .LBB12_4 ++; LA64-NEXT: .LBB12_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB12_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic ++ ret void ++} ++ ++define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwind { ++; LA64-LABEL: cmpxchg_i16_monotonic_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a0, $a4, $a0 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a3, 0 ++; LA64-NEXT: and $a5, $a4, $a0 ++; LA64-NEXT: bne $a5, $a1, .LBB13_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB13_1 Depth=1 ++; LA64-NEXT: andn $a5, $a4, $a0 ++; LA64-NEXT: or $a5, $a5, $a2 ++; LA64-NEXT: sc.w $a5, $a3, 0 ++; LA64-NEXT: beqz $a5, .LBB13_1 ++; LA64-NEXT: b .LBB13_4 ++; LA64-NEXT: .LBB13_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB13_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic ++ ret void ++} ++ ++define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { ++; LA64-LABEL: 
cmpxchg_i32_monotonic_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB14_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB14_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.w $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB14_1 ++; LA64-NEXT: b .LBB14_4 ++; LA64-NEXT: .LBB14_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB14_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic ++ ret void ++} ++ ++define void @cmpxchg_i64_monotonic_monotonic(ptr %ptr, i64 %cmp, i64 %val) nounwind { ++; LA64-LABEL: cmpxchg_i64_monotonic_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB15_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB15_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.d $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB15_1 ++; LA64-NEXT: b .LBB15_4 ++; LA64-NEXT: .LBB15_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB15_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic ++ ret void ++} ++ ++define i8 @cmpxchg_i8_monotonic_monotonic_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind { ++; LA64-LABEL: cmpxchg_i8_monotonic_monotonic_reti8: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a4, $zero, 255 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: andi $a2, $a2, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a3, 0 ++; LA64-NEXT: and $a6, $a5, $a4 ++; LA64-NEXT: bne $a6, $a1, .LBB16_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB16_1 Depth=1 ++; LA64-NEXT: 
andn $a6, $a5, $a4 ++; LA64-NEXT: or $a6, $a6, $a2 ++; LA64-NEXT: sc.w $a6, $a3, 0 ++; LA64-NEXT: beqz $a6, .LBB16_1 ++; LA64-NEXT: b .LBB16_4 ++; LA64-NEXT: .LBB16_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB16_4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic ++ %res = extractvalue { i8, i1 } %tmp, 0 ++ ret i8 %res ++} ++ ++define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) nounwind { ++; LA64-LABEL: cmpxchg_i16_monotonic_monotonic_reti16: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a3, 0 ++; LA64-NEXT: and $a6, $a5, $a4 ++; LA64-NEXT: bne $a6, $a1, .LBB17_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB17_1 Depth=1 ++; LA64-NEXT: andn $a6, $a5, $a4 ++; LA64-NEXT: or $a6, $a6, $a2 ++; LA64-NEXT: sc.w $a6, $a3, 0 ++; LA64-NEXT: beqz $a6, .LBB17_1 ++; LA64-NEXT: b .LBB17_4 ++; LA64-NEXT: .LBB17_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB17_4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic ++ %res = extractvalue { i16, i1 } %tmp, 0 ++ ret i16 %res ++} ++ ++define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { ++; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti32: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB18_3 ++; LA64-NEXT: # %bb.2: # in 
Loop: Header=BB18_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.w $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB18_1 ++; LA64-NEXT: b .LBB18_4 ++; LA64-NEXT: .LBB18_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB18_4: ++; LA64-NEXT: move $a0, $a3 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic ++ %res = extractvalue { i32, i1 } %tmp, 0 ++ ret i32 %res ++} ++ ++define i64 @cmpxchg_i64_monotonic_monotonic_reti64(ptr %ptr, i64 %cmp, i64 %val) nounwind { ++; LA64-LABEL: cmpxchg_i64_monotonic_monotonic_reti64: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB19_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB19_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.d $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB19_1 ++; LA64-NEXT: b .LBB19_4 ++; LA64-NEXT: .LBB19_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB19_4: ++; LA64-NEXT: move $a0, $a3 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic ++ %res = extractvalue { i64, i1 } %tmp, 0 ++ ret i64 %res ++} ++ ++define i1 @cmpxchg_i8_monotonic_monotonic_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind { ++; LA64-LABEL: cmpxchg_i8_monotonic_monotonic_reti1: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a4, $zero, 255 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: andi $a2, $a2, 255 ++; LA64-NEXT: sll.w $a0, $a2, $a0 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: addi.w $a2, $a4, 0 ++; LA64-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a3, 0 ++; LA64-NEXT: and $a6, $a5, $a2 ++; LA64-NEXT: bne $a6, $a1, .LBB20_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB20_1 Depth=1 ++; LA64-NEXT: andn $a6, $a5, $a2 ++; LA64-NEXT: or $a6, $a6, 
$a0 ++; LA64-NEXT: sc.w $a6, $a3, 0 ++; LA64-NEXT: beqz $a6, .LBB20_1 ++; LA64-NEXT: b .LBB20_4 ++; LA64-NEXT: .LBB20_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB20_4: ++; LA64-NEXT: and $a0, $a5, $a4 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: xor $a0, $a1, $a0 ++; LA64-NEXT: sltui $a0, $a0, 1 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic ++ %res = extractvalue { i8, i1 } %tmp, 1 ++ ret i1 %res ++} ++ ++define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) nounwind { ++; LA64-LABEL: cmpxchg_i16_monotonic_monotonic_reti1: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 ++; LA64-NEXT: sll.w $a0, $a2, $a0 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: addi.w $a2, $a4, 0 ++; LA64-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a3, 0 ++; LA64-NEXT: and $a6, $a5, $a2 ++; LA64-NEXT: bne $a6, $a1, .LBB21_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1 ++; LA64-NEXT: andn $a6, $a5, $a2 ++; LA64-NEXT: or $a6, $a6, $a0 ++; LA64-NEXT: sc.w $a6, $a3, 0 ++; LA64-NEXT: beqz $a6, .LBB21_1 ++; LA64-NEXT: b .LBB21_4 ++; LA64-NEXT: .LBB21_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB21_4: ++; LA64-NEXT: and $a0, $a5, $a4 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: xor $a0, $a1, $a0 ++; LA64-NEXT: sltui $a0, $a0, 1 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic ++ %res = extractvalue { i16, i1 } %tmp, 1 ++ ret i1 %res ++} ++ ++define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { ++; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti1: ++; LA64: # %bb.0: ++; LA64-NEXT: 
.LBB22_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB22_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.w $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB22_1 ++; LA64-NEXT: b .LBB22_4 ++; LA64-NEXT: .LBB22_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB22_4: ++; LA64-NEXT: addi.w $a0, $a1, 0 ++; LA64-NEXT: xor $a0, $a3, $a0 ++; LA64-NEXT: sltui $a0, $a0, 1 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic ++ %res = extractvalue { i32, i1 } %tmp, 1 ++ ret i1 %res ++} ++ ++define i1 @cmpxchg_i64_monotonic_monotonic_reti1(ptr %ptr, i64 %cmp, i64 %val) nounwind { ++; LA64-LABEL: cmpxchg_i64_monotonic_monotonic_reti1: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB23_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB23_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.d $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB23_1 ++; LA64-NEXT: b .LBB23_4 ++; LA64-NEXT: .LBB23_3: ++; LA64-NEXT: dbar 1792 ++; LA64-NEXT: .LBB23_4: ++; LA64-NEXT: xor $a0, $a3, $a1 ++; LA64-NEXT: sltui $a0, $a0, 1 ++; LA64-NEXT: ret ++ %tmp = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic ++ %res = extractvalue { i64, i1 } %tmp, 1 ++ ret i1 %res ++} +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +index 9767717395b6..9a29d67e9982 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +@@ -25,14 +25,13 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64F-NEXT: bne $a3, $a2, .LBB0_5 + ; LA64F-NEXT: # %bb.4: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB0_3 Depth=2 +-; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: move $a4, $a1 + ; LA64F-NEXT: sc.w $a4, $a0, 0 + ; LA64F-NEXT: 
beqz $a4, .LBB0_3 + ; LA64F-NEXT: b .LBB0_6 + ; LA64F-NEXT: .LBB0_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -61,14 +60,13 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64D-NEXT: bne $a3, $a2, .LBB0_5 + ; LA64D-NEXT: # %bb.4: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB0_3 Depth=2 +-; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: move $a4, $a1 + ; LA64D-NEXT: sc.w $a4, $a0, 0 + ; LA64D-NEXT: beqz $a4, .LBB0_3 + ; LA64D-NEXT: b .LBB0_6 + ; LA64D-NEXT: .LBB0_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -101,14 +99,13 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64F-NEXT: bne $a3, $a2, .LBB1_5 + ; LA64F-NEXT: # %bb.4: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB1_3 Depth=2 +-; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: move $a4, $a1 + ; LA64F-NEXT: sc.w $a4, $a0, 0 + ; LA64F-NEXT: beqz $a4, .LBB1_3 + ; LA64F-NEXT: b .LBB1_6 + ; LA64F-NEXT: .LBB1_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -137,14 +134,13 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64D-NEXT: bne $a3, $a2, .LBB1_5 + ; LA64D-NEXT: # %bb.4: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB1_3 Depth=2 +-; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: move $a4, $a1 + ; LA64D-NEXT: sc.w $a4, $a0, 0 + ; LA64D-NEXT: beqz $a4, .LBB1_3 + ; LA64D-NEXT: b .LBB1_6 + ; LA64D-NEXT: .LBB1_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; 
LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -178,14 +174,13 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64F-NEXT: bne $a3, $a2, .LBB2_5 + ; LA64F-NEXT: # %bb.4: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB2_3 Depth=2 +-; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: move $a4, $a1 + ; LA64F-NEXT: sc.w $a4, $a0, 0 + ; LA64F-NEXT: beqz $a4, .LBB2_3 + ; LA64F-NEXT: b .LBB2_6 + ; LA64F-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -215,14 +210,13 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64D-NEXT: bne $a3, $a2, .LBB2_5 + ; LA64D-NEXT: # %bb.4: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB2_3 Depth=2 +-; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: move $a4, $a1 + ; LA64D-NEXT: sc.w $a4, $a0, 0 + ; LA64D-NEXT: beqz $a4, .LBB2_3 + ; LA64D-NEXT: b .LBB2_6 + ; LA64D-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -256,14 +250,13 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64F-NEXT: bne $a3, $a2, .LBB3_5 + ; LA64F-NEXT: # %bb.4: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB3_3 Depth=2 +-; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: move $a4, $a1 + ; LA64F-NEXT: sc.w $a4, $a0, 0 + ; LA64F-NEXT: beqz $a4, .LBB3_3 + ; LA64F-NEXT: b .LBB3_6 + ; LA64F-NEXT: .LBB3_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 +-; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: dbar 0 + ; LA64F-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 
+ ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -293,14 +286,13 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64D-NEXT: bne $a3, $a2, .LBB3_5 + ; LA64D-NEXT: # %bb.4: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB3_3 Depth=2 +-; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: move $a4, $a1 + ; LA64D-NEXT: sc.w $a4, $a0, 0 + ; LA64D-NEXT: beqz $a4, .LBB3_3 + ; LA64D-NEXT: b .LBB3_6 + ; LA64D-NEXT: .LBB3_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 +-; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: dbar 0 + ; LA64D-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll +index cd4a9e7fa9c4..26ba77e8d4fd 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll +@@ -17,7 +17,6 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a6, $a4, $a3 + ; LA64-NEXT: move $a5, $a4 +@@ -30,8 +29,6 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sc.w $a5, $a2, 0 + ; LA64-NEXT: beqz $a5, .LBB0_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw umax ptr %a, i8 %b acquire +@@ -52,7 +49,6 @@ define i16 @atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a6, $a4, $a3 + ; LA64-NEXT: move $a5, $a4 +@@ -65,8 +61,6 @@ define i16 
@atomicrmw_umax_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sc.w $a5, $a2, 0 + ; LA64-NEXT: beqz $a5, .LBB1_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw umax ptr %a, i16 %b acquire +@@ -106,7 +100,6 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a6, $a4, $a3 + ; LA64-NEXT: move $a5, $a4 +@@ -119,8 +112,6 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sc.w $a5, $a2, 0 + ; LA64-NEXT: beqz $a5, .LBB4_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw umin ptr %a, i8 %b acquire +@@ -141,7 +132,6 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a6, $a4, $a3 + ; LA64-NEXT: move $a5, $a4 +@@ -154,8 +144,6 @@ define i16 @atomicrmw_umin_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sc.w $a5, $a2, 0 + ; LA64-NEXT: beqz $a5, .LBB5_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw umin ptr %a, i16 %b acquire +@@ -197,7 +185,6 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: andi $a4, $a0, 24 + ; LA64-NEXT: xori $a4, $a4, 56 + ; LA64-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a5, $a2, 0 + ; LA64-NEXT: and $a7, $a5, $a3 + ; LA64-NEXT: move $a6, $a5 +@@ -212,8 +199,6 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) 
nounwind { + ; LA64-NEXT: sc.w $a6, $a2, 0 + ; LA64-NEXT: beqz $a6, .LBB8_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw max ptr %a, i8 %b acquire +@@ -237,7 +222,6 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a5, $a2, 0 + ; LA64-NEXT: and $a7, $a5, $a4 + ; LA64-NEXT: move $a6, $a5 +@@ -252,8 +236,6 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sc.w $a6, $a2, 0 + ; LA64-NEXT: beqz $a6, .LBB9_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw max ptr %a, i16 %b acquire +@@ -295,7 +277,6 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: andi $a4, $a0, 24 + ; LA64-NEXT: xori $a4, $a4, 56 + ; LA64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a5, $a2, 0 + ; LA64-NEXT: and $a7, $a5, $a3 + ; LA64-NEXT: move $a6, $a5 +@@ -310,8 +291,6 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sc.w $a6, $a2, 0 + ; LA64-NEXT: beqz $a6, .LBB12_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw min ptr %a, i8 %b acquire +@@ -335,7 +314,6 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a5, $a2, 0 + ; LA64-NEXT: and $a7, $a5, $a4 + ; LA64-NEXT: move $a6, $a5 +@@ -350,8 +328,6 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sc.w $a6, $a2, 0 + ; 
LA64-NEXT: beqz $a6, .LBB13_1 + ; LA64-NEXT: # %bb.4: +-; LA64-NEXT: dbar 1792 +-; LA64-NEXT: # %bb.5: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %1 = atomicrmw min ptr %a, i16 %b acquire +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll +index c077d14f728f..626276ba05f7 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll +@@ -13,7 +13,6 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: addi.w $a5, $a1, 0 + ; LA32-NEXT: xor $a5, $a4, $a5 +@@ -37,7 +36,6 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: addi.w $a5, $a1, 0 + ; LA64-NEXT: xor $a5, $a4, $a5 +@@ -64,7 +62,6 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: addi.w $a5, $a1, 0 + ; LA32-NEXT: xor $a5, $a4, $a5 +@@ -89,7 +86,6 @@ define i16 @atomicrmw_xchg_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: addi.w $a5, $a1, 0 + ; LA64-NEXT: xor $a5, $a4, $a5 +@@ -108,7 +104,6 @@ define i32 @atomicrmw_xchg_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_xchg_i32_acquire: + ; LA32: # %bb.0: + ; 
LA32-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: move $a3, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +@@ -157,7 +152,6 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: add.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 +@@ -181,7 +175,6 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: add.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 +@@ -208,7 +201,6 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: add.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 +@@ -233,7 +225,6 @@ define i16 @atomicrmw_add_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: add.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 +@@ -252,7 +243,6 @@ define i32 @atomicrmw_add_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_add_i32_acquire: + ; LA32: # %bb.0: + ; LA32-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: add.w $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +@@ -301,7 +291,6 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, 
$a0 + ; LA32-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: sub.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 +@@ -325,7 +314,6 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: sub.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 +@@ -352,7 +340,6 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: sub.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 +@@ -377,7 +364,6 @@ define i16 @atomicrmw_sub_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: sub.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 +@@ -396,7 +382,6 @@ define i32 @atomicrmw_sub_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_sub_i32_acquire: + ; LA32: # %bb.0: + ; LA32-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: sub.w $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +@@ -447,7 +432,6 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: and $a5, $a4, $a1 + ; LA32-NEXT: nor $a5, $a5, $zero +@@ -472,7 +456,6 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; 
LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a5, $a4, $a1 + ; LA64-NEXT: nor $a5, $a5, $zero +@@ -500,7 +483,6 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: and $a5, $a4, $a1 + ; LA32-NEXT: nor $a5, $a5, $zero +@@ -526,7 +508,6 @@ define i16 @atomicrmw_nand_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a5, $a4, $a1 + ; LA64-NEXT: nor $a5, $a5, $zero +@@ -546,7 +527,6 @@ define i32 @atomicrmw_nand_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_nand_i32_acquire: + ; LA32: # %bb.0: + ; LA32-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: and $a3, $a2, $a1 + ; LA32-NEXT: nor $a3, $a3, $zero +@@ -559,7 +539,6 @@ define i32 @atomicrmw_nand_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA64-LABEL: atomicrmw_nand_i32_acquire: + ; LA64: # %bb.0: + ; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.w $a2, $a0, 0 + ; LA64-NEXT: and $a3, $a2, $a1 + ; LA64-NEXT: nor $a3, $a3, $zero +@@ -586,7 +565,6 @@ define i64 @atomicrmw_nand_i64_acquire(ptr %a, i64 %b) nounwind { + ; LA64-LABEL: atomicrmw_nand_i64_acquire: + ; LA64: # %bb.0: + ; LA64-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: dbar 0 + ; LA64-NEXT: ll.d $a2, $a0, 0 + ; LA64-NEXT: and $a3, $a2, $a1 + ; LA64-NEXT: nor $a3, $a3, $zero +@@ -611,7 +589,6 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: 
addi.w $a3, $zero, -4 + ; LA32-NEXT: and $a0, $a0, $a3 + ; LA32-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a3, $a0, 0 + ; LA32-NEXT: and $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a0, 0 +@@ -650,7 +627,6 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: addi.w $a2, $zero, -4 + ; LA32-NEXT: and $a0, $a0, $a2 + ; LA32-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: and $a4, $a2, $a1 + ; LA32-NEXT: sc.w $a4, $a0, 0 +@@ -681,7 +657,6 @@ define i32 @atomicrmw_and_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_and_i32_acquire: + ; LA32: # %bb.0: + ; LA32-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: and $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +@@ -728,7 +703,6 @@ define i8 @atomicrmw_or_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: or $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +@@ -760,7 +734,6 @@ define i16 @atomicrmw_or_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: or $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +@@ -787,7 +760,6 @@ define i32 @atomicrmw_or_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_or_i32_acquire: + ; LA32: # %bb.0: + ; LA32-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: or $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +@@ -834,7 +806,6 @@ define i8 @atomicrmw_xor_i8_acquire(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: andi 
$a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: xor $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +@@ -866,7 +837,6 @@ define i16 @atomicrmw_xor_i16_acquire(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 + ; LA32-NEXT: .LBB25_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: xor $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +@@ -893,7 +863,6 @@ define i32 @atomicrmw_xor_i32_acquire(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_xor_i32_acquire: + ; LA32: # %bb.0: + ; LA32-NEXT: .LBB26_1: # =>This Inner Loop Header: Depth=1 +-; LA32-NEXT: dbar 0 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: xor $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +diff --git a/llvm/unittests/Target/LoongArch/InstSizes.cpp b/llvm/unittests/Target/LoongArch/InstSizes.cpp +index 1a5d4369c48b..3180c7237a79 100644 +--- a/llvm/unittests/Target/LoongArch/InstSizes.cpp ++++ b/llvm/unittests/Target/LoongArch/InstSizes.cpp +@@ -121,7 +121,7 @@ TEST(InstSizes, AtomicPseudo) { + " dead early-clobber renamable $r10, dead early-clobber renamable $r11 = PseudoAtomicLoadAdd32 renamable $r7, renamable $r6, renamable $r8\n" + " dead early-clobber renamable $r5, dead early-clobber renamable $r9, dead early-clobber renamable $r10 = PseudoMaskedAtomicLoadUMax32 renamable $r7, renamable $r6, renamable $r8, 4\n" + " early-clobber renamable $r9, dead early-clobber renamable $r10, dead early-clobber renamable $r11 = PseudoMaskedAtomicLoadMax32 killed renamable $r6, killed renamable $r5, killed renamable $r7, killed renamable $r8, 4\n" +- " dead early-clobber renamable $r5, dead early-clobber renamable $r9 = PseudoCmpXchg32 renamable $r7, renamable $r4, renamable $r6\n" ++ " dead early-clobber renamable $r5, dead early-clobber renamable $r9 = PseudoCmpXchg32 
renamable $r7, renamable $r4, renamable $r6, 4\n" + " dead early-clobber renamable $r5, dead early-clobber renamable $r9 = PseudoMaskedCmpXchg32 killed renamable $r7, killed renamable $r4, killed renamable $r6, killed renamable $r8, 4\n", + // clang-format on + [](LoongArchInstrInfo &II, MachineFunction &MF) { +-- +2.20.1 + + +From 5f2a6174965bccaeefdeb410cf67ea0cb378b26c Mon Sep 17 00:00:00 2001 +From: hev +Date: Wed, 11 Oct 2023 18:28:04 +0800 +Subject: [PATCH 2/7] [LoongArch] Add some atomic tests (#68766) + +(cherry picked from commit 37b93f07cd7ba2b1e6e81116cd49d34396b7b70a) +--- + .../LoongArch/ir-instruction/atomicrmw-fp.ll | 2714 ++++++++++++- + .../ir-instruction/atomicrmw-minmax.ll | 1400 +++++++ + .../LoongArch/ir-instruction/atomicrmw.ll | 3346 ++++++++++++++++- + .../ir-instruction/fence-singlethread.ll | 17 + + .../ir-instruction/load-store-atomic.ll | 196 + + 5 files changed, 7609 insertions(+), 64 deletions(-) + create mode 100644 llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll + +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +index 9a29d67e9982..02d481cb3865 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +@@ -2,8 +2,6 @@ + ; RUN: llc --mtriple=loongarch64 --mattr=+f,-d < %s | FileCheck %s --check-prefix=LA64F + ; RUN: llc --mtriple=loongarch64 --mattr=+d < %s | FileCheck %s --check-prefix=LA64D + +-;; Add more test cases after supporting different AtomicOrdering. 
+- + define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64F-LABEL: float_fadd_acquire: + ; LA64F: # %bb.0: +@@ -681,3 +679,2715 @@ define double @double_fmax_acquire(ptr %p) nounwind { + %v = atomicrmw fmax ptr %p, double 1.0 acquire, align 4 + ret double %v + } ++ ++define float @float_fadd_release(ptr %p) nounwind { ++; LA64F-LABEL: float_fadd_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB8_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB8_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB8_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB8_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB8_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB8_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB8_3 ++; LA64F-NEXT: b .LBB8_6 ++; LA64F-NEXT: .LBB8_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB8_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB8_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB8_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB8_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fadd_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB8_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB8_3 Depth 2 ++; 
LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB8_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB8_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB8_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB8_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB8_3 ++; LA64D-NEXT: b .LBB8_6 ++; LA64D-NEXT: .LBB8_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB8_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB8_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB8_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB8_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, float 1.0 release, align 4 ++ ret float %v ++} ++ ++define float @float_fsub_release(ptr %p) nounwind { ++; LA64F-LABEL: float_fsub_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) ++; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI9_0) ++; LA64F-NEXT: fld.s $fa1, $a1, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB9_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB9_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB9_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB9_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB9_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB9_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB9_3 ++; LA64F-NEXT: b .LBB9_6 ++; 
LA64F-NEXT: .LBB9_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB9_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB9_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB9_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB9_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fsub_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) ++; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI9_0) ++; LA64D-NEXT: fld.s $fa1, $a1, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB9_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB9_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB9_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB9_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB9_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB9_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB9_3 ++; LA64D-NEXT: b .LBB9_6 ++; LA64D-NEXT: .LBB9_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB9_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB9_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB9_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB9_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, float 1.0 release, align 4 ++ ret float %v ++} ++ ++define float @float_fmin_release(ptr %p) nounwind { ++; LA64F-LABEL: float_fmin_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, 
$a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB10_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB10_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB10_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB10_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB10_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB10_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB10_3 ++; LA64F-NEXT: b .LBB10_6 ++; LA64F-NEXT: .LBB10_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB10_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB10_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB10_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB10_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmin_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB10_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB10_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB10_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB10_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB10_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: 
Header=BB10_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB10_3 ++; LA64D-NEXT: b .LBB10_6 ++; LA64D-NEXT: .LBB10_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB10_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB10_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB10_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB10_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, float 1.0 release, align 4 ++ ret float %v ++} ++ ++define float @float_fmax_release(ptr %p) nounwind { ++; LA64F-LABEL: float_fmax_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB11_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB11_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB11_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB11_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB11_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB11_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB11_3 ++; LA64F-NEXT: b .LBB11_6 ++; LA64F-NEXT: .LBB11_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB11_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB11_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB11_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB11_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; 
LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmax_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB11_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB11_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB11_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB11_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB11_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB11_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB11_3 ++; LA64D-NEXT: b .LBB11_6 ++; LA64D-NEXT: .LBB11_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB11_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB11_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB11_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB11_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, float 1.0 release, align 4 ++ ret float %v ++} ++ ++define double @double_fadd_release(ptr %p) nounwind { ++; LA64F-LABEL: double_fadd_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; 
LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 3 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB12_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB12_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fadd_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 
++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 3 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB12_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB12_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, double 1.0 release, align 4 ++ ret double %v ++} ++ ++define double @double_fsub_release(ptr %p) nounwind { ++; LA64F-LABEL: double_fsub_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, -1025 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: 
addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 3 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB13_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB13_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fsub_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) ++; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_0) ++; LA64D-NEXT: fld.d $fs0, $a0, 0 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 3 ++; LA64D-NEXT: 
.p2align 4, , 16 ++; LA64D-NEXT: .LBB13_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB13_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, double 1.0 release, align 4 ++ ret double %v ++} ++ ++define double @double_fmin_release(ptr %p) nounwind { ++; LA64F-LABEL: double_fmin_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 3 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB14_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This 
Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmin) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB14_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmin_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 3 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB14_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 
++; LA64D-NEXT: fmin.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB14_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, double 1.0 release, align 4 ++ ret double %v ++} ++ ++define double @double_fmax_release(ptr %p) nounwind { ++; LA64F-LABEL: double_fmax_release: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 3 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB15_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmax) ++; LA64F-NEXT: st.d $a0, 
$sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB15_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmax_release: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 3 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB15_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; 
LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB15_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, double 1.0 release, align 4 ++ ret double %v ++} ++ ++define float @float_fadd_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: float_fadd_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB16_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB16_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB16_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB16_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB16_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB16_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB16_3 ++; LA64F-NEXT: b .LBB16_6 ++; LA64F-NEXT: .LBB16_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB16_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB16_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB16_1 
Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB16_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fadd_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB16_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB16_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB16_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB16_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB16_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB16_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB16_3 ++; LA64D-NEXT: b .LBB16_6 ++; LA64D-NEXT: .LBB16_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB16_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB16_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, float 1.0 acq_rel, align 4 ++ ret float %v ++} ++ ++define float @float_fsub_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: float_fsub_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) ++; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) ++; LA64F-NEXT: fld.s $fa1, $a1, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB17_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # 
Child Loop BB17_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB17_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB17_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB17_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB17_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB17_3 ++; LA64F-NEXT: b .LBB17_6 ++; LA64F-NEXT: .LBB17_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB17_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB17_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fsub_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) ++; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) ++; LA64D-NEXT: fld.s $fa1, $a1, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB17_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB17_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB17_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB17_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB17_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB17_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB17_3 ++; LA64D-NEXT: b .LBB17_6 ++; LA64D-NEXT: .LBB17_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB17_1 
Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB17_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB17_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB17_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, float 1.0 acq_rel, align 4 ++ ret float %v ++} ++ ++define float @float_fmin_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: float_fmin_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB18_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB18_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB18_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB18_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB18_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB18_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB18_3 ++; LA64F-NEXT: b .LBB18_6 ++; LA64F-NEXT: .LBB18_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB18_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB18_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmin_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: 
.p2align 4, , 16 ++; LA64D-NEXT: .LBB18_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB18_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB18_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB18_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB18_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB18_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB18_3 ++; LA64D-NEXT: b .LBB18_6 ++; LA64D-NEXT: .LBB18_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB18_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB18_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, float 1.0 acq_rel, align 4 ++ ret float %v ++} ++ ++define float @float_fmax_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: float_fmax_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB19_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB19_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB19_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB19_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB19_5 
++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB19_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB19_3 ++; LA64F-NEXT: b .LBB19_6 ++; LA64F-NEXT: .LBB19_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB19_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB19_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmax_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB19_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB19_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB19_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB19_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB19_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB19_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB19_3 ++; LA64D-NEXT: b .LBB19_6 ++; LA64D-NEXT: .LBB19_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB19_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB19_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, float 1.0 acq_rel, 
align 4 ++ ret float %v ++} ++ ++define double @double_fadd_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: double_fadd_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s5, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: ori $s4, $zero, 4 ++; LA64F-NEXT: ori $s5, $zero, 2 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB20_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s5 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB20_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s5, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; 
LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fadd_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: ori $s3, $zero, 4 ++; LA64D-NEXT: ori $s4, $zero, 2 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB20_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s4 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB20_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; 
LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, double 1.0 acq_rel, align 4 ++ ret double %v ++} ++ ++define double @double_fsub_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: double_fsub_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s5, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, -1025 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: ori $s4, $zero, 4 ++; LA64F-NEXT: ori $s5, $zero, 2 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB21_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s5 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB21_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s5, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: 
ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fsub_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) ++; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_0) ++; LA64D-NEXT: fld.d $fs0, $a0, 0 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: ori $s3, $zero, 4 ++; LA64D-NEXT: ori $s4, $zero, 2 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB21_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s4 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB21_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded 
Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, double 1.0 acq_rel, align 4 ++ ret double %v ++} ++ ++define double @double_fmin_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: double_fmin_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s5, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: ori $s4, $zero, 4 ++; LA64F-NEXT: ori $s5, $zero, 2 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB22_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmin) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s5 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB22_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s5, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; 
LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmin_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: ori $s3, $zero, 4 ++; LA64D-NEXT: ori $s4, $zero, 2 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB22_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmin.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s4 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB22_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, 
$sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, double 1.0 acq_rel, align 4 ++ ret double %v ++} ++ ++define double @double_fmax_acq_rel(ptr %p) nounwind { ++; LA64F-LABEL: double_fmax_acq_rel: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s5, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: ori $s4, $zero, 4 ++; LA64F-NEXT: ori $s5, $zero, 2 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB23_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmax) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s5 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB23_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s5, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte 
Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmax_acq_rel: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: ori $s3, $zero, 4 ++; LA64D-NEXT: ori $s4, $zero, 2 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB23_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s4 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB23_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; 
LA64D-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, double 1.0 acq_rel, align 4 ++ ret double %v ++} ++ ++define float @float_fadd_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: float_fadd_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB24_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB24_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB24_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB24_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB24_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB24_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB24_3 ++; LA64F-NEXT: b .LBB24_6 ++; LA64F-NEXT: .LBB24_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB24_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB24_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fadd_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; 
LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB24_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB24_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB24_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB24_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB24_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB24_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB24_3 ++; LA64D-NEXT: b .LBB24_6 ++; LA64D-NEXT: .LBB24_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB24_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB24_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, float 1.0 seq_cst, align 4 ++ ret float %v ++} ++ ++define float @float_fsub_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: float_fsub_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) ++; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI25_0) ++; LA64F-NEXT: fld.s $fa1, $a1, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB25_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB25_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB25_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB25_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 
++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB25_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB25_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB25_3 ++; LA64F-NEXT: b .LBB25_6 ++; LA64F-NEXT: .LBB25_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB25_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB25_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fsub_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) ++; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI25_0) ++; LA64D-NEXT: fld.s $fa1, $a1, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB25_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB25_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB25_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB25_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB25_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB25_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB25_3 ++; LA64D-NEXT: b .LBB25_6 ++; LA64D-NEXT: .LBB25_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB25_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB25_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; 
LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, float 1.0 seq_cst, align 4 ++ ret float %v ++} ++ ++define float @float_fmin_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: float_fmin_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB26_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB26_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB26_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB26_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB26_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB26_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB26_3 ++; LA64F-NEXT: b .LBB26_6 ++; LA64F-NEXT: .LBB26_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB26_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB26_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmin_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB26_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB26_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; 
LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB26_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB26_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB26_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB26_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB26_3 ++; LA64D-NEXT: b .LBB26_6 ++; LA64D-NEXT: .LBB26_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB26_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB26_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, float 1.0 seq_cst, align 4 ++ ret float %v ++} ++ ++define float @float_fmax_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: float_fmax_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB27_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB27_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB27_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB27_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB27_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB27_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB27_3 ++; LA64F-NEXT: b .LBB27_6 ++; LA64F-NEXT: .LBB27_5: # %atomicrmw.start ++; 
LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 ++; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: .LBB27_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB27_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmax_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB27_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB27_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB27_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB27_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB27_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB27_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB27_3 ++; LA64D-NEXT: b .LBB27_6 ++; LA64D-NEXT: .LBB27_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 ++; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: .LBB27_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB27_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, float 1.0 seq_cst, align 4 ++ ret float %v ++} ++ ++define double @double_fadd_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: double_fadd_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 
64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 5 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB28_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s4 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB28_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fadd_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 
8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 5 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB28_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s3 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB28_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, double 1.0 seq_cst, align 4 ++ ret double %v ++} ++ ++define double @double_fsub_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: double_fsub_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded 
Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, -1025 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 5 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB29_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s4 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB29_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fsub_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d 
$fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) ++; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_0) ++; LA64D-NEXT: fld.d $fs0, $a0, 0 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 5 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB29_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s3 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB29_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, double 1.0 seq_cst, align 4 ++ ret double %v ++} ++ ++define double @double_fmin_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: double_fmin_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; 
LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 5 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB30_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmin) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s4 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB30_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmin_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; 
LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 5 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB30_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmin.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s3 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB30_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, double 1.0 seq_cst, align 4 ++ ret double %v ++} ++ ++define double @double_fmax_seq_cst(ptr %p) nounwind { ++; LA64F-LABEL: double_fmax_seq_cst: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -80 ++; LA64F-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 16 ++; LA64F-NEXT: 
addi.d $s3, $sp, 8 ++; LA64F-NEXT: ori $s4, $zero, 5 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB31_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 16 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmax) ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $s4 ++; LA64F-NEXT: move $a5, $s4 ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 16 ++; LA64F-NEXT: beqz $a1, .LBB31_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 80 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmax_seq_cst: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -80 ++; LA64D-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 16 ++; LA64D-NEXT: addi.d $s2, $sp, 8 ++; LA64D-NEXT: ori $s3, $zero, 5 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: 
.LBB31_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 16 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $s3 ++; LA64D-NEXT: move $a5, $s3 ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 16 ++; LA64D-NEXT: beqz $a0, .LBB31_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 80 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, double 1.0 seq_cst, align 4 ++ ret double %v ++} ++ ++define float @float_fadd_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: float_fadd_monotonic: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB32_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB32_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB32_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB32_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB32_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB32_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 
++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB32_3 ++; LA64F-NEXT: b .LBB32_6 ++; LA64F-NEXT: .LBB32_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB32_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB32_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB32_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB32_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fadd_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB32_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB32_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB32_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB32_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB32_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB32_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB32_3 ++; LA64D-NEXT: b .LBB32_6 ++; LA64D-NEXT: .LBB32_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB32_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB32_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB32_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB32_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, float 1.0 monotonic, align 4 ++ ret float %v ++} ++ ++define float @float_fsub_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: float_fsub_monotonic: ++; LA64F: # %bb.0: ++; 
LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) ++; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI33_0) ++; LA64F-NEXT: fld.s $fa1, $a1, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB33_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB33_3 Depth 2 ++; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB33_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB33_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB33_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB33_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB33_3 ++; LA64F-NEXT: b .LBB33_6 ++; LA64F-NEXT: .LBB33_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB33_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB33_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB33_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB33_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fsub_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) ++; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI33_0) ++; LA64D-NEXT: fld.s $fa1, $a1, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB33_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB33_3 Depth 2 ++; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB33_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB33_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, 
.LBB33_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB33_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB33_3 ++; LA64D-NEXT: b .LBB33_6 ++; LA64D-NEXT: .LBB33_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB33_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB33_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB33_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB33_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, float 1.0 monotonic, align 4 ++ ret float %v ++} ++ ++define float @float_fmin_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: float_fmin_monotonic: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB34_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB34_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB34_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB34_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB34_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB34_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB34_3 ++; LA64F-NEXT: b .LBB34_6 ++; LA64F-NEXT: .LBB34_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB34_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB34_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB34_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 
++; LA64F-NEXT: bne $a3, $a1, .LBB34_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmin_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB34_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB34_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB34_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB34_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB34_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB34_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB34_3 ++; LA64D-NEXT: b .LBB34_6 ++; LA64D-NEXT: .LBB34_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB34_1 Depth=1 ++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB34_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB34_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB34_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, float 1.0 monotonic, align 4 ++ ret float %v ++} ++ ++define float @float_fmax_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: float_fmax_monotonic: ++; LA64F: # %bb.0: ++; LA64F-NEXT: fld.s $fa0, $a0, 0 ++; LA64F-NEXT: addi.w $a1, $zero, 1 ++; LA64F-NEXT: movgr2fr.w $fa1, $a1 ++; LA64F-NEXT: ffint.s.w $fa1, $fa1 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB35_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Loop Header: Depth=1 ++; LA64F-NEXT: # Child Loop BB35_3 Depth 2 ++; LA64F-NEXT: fmax.s $fa2, $fa0, 
$fa0 ++; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64F-NEXT: movfr2gr.s $a1, $fa2 ++; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: .LBB35_3: # %atomicrmw.start ++; LA64F-NEXT: # Parent Loop BB35_1 Depth=1 ++; LA64F-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64F-NEXT: ll.w $a3, $a0, 0 ++; LA64F-NEXT: bne $a3, $a2, .LBB35_5 ++; LA64F-NEXT: # %bb.4: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB35_3 Depth=2 ++; LA64F-NEXT: move $a4, $a1 ++; LA64F-NEXT: sc.w $a4, $a0, 0 ++; LA64F-NEXT: beqz $a4, .LBB35_3 ++; LA64F-NEXT: b .LBB35_6 ++; LA64F-NEXT: .LBB35_5: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB35_1 Depth=1 ++; LA64F-NEXT: dbar 1792 ++; LA64F-NEXT: .LBB35_6: # %atomicrmw.start ++; LA64F-NEXT: # in Loop: Header=BB35_1 Depth=1 ++; LA64F-NEXT: movgr2fr.w $fa0, $a3 ++; LA64F-NEXT: addi.w $a1, $a2, 0 ++; LA64F-NEXT: bne $a3, $a1, .LBB35_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: float_fmax_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: fld.s $fa0, $a0, 0 ++; LA64D-NEXT: addi.w $a1, $zero, 1 ++; LA64D-NEXT: movgr2fr.w $fa1, $a1 ++; LA64D-NEXT: ffint.s.w $fa1, $fa1 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB35_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Loop Header: Depth=1 ++; LA64D-NEXT: # Child Loop BB35_3 Depth 2 ++; LA64D-NEXT: fmax.s $fa2, $fa0, $fa0 ++; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 ++; LA64D-NEXT: movfr2gr.s $a1, $fa2 ++; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: .LBB35_3: # %atomicrmw.start ++; LA64D-NEXT: # Parent Loop BB35_1 Depth=1 ++; LA64D-NEXT: # => This Inner Loop Header: Depth=2 ++; LA64D-NEXT: ll.w $a3, $a0, 0 ++; LA64D-NEXT: bne $a3, $a2, .LBB35_5 ++; LA64D-NEXT: # %bb.4: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB35_3 Depth=2 ++; LA64D-NEXT: move $a4, $a1 ++; LA64D-NEXT: sc.w $a4, $a0, 0 ++; LA64D-NEXT: beqz $a4, .LBB35_3 ++; LA64D-NEXT: b .LBB35_6 ++; LA64D-NEXT: .LBB35_5: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB35_1 Depth=1 
++; LA64D-NEXT: dbar 1792 ++; LA64D-NEXT: .LBB35_6: # %atomicrmw.start ++; LA64D-NEXT: # in Loop: Header=BB35_1 Depth=1 ++; LA64D-NEXT: movgr2fr.w $fa0, $a3 ++; LA64D-NEXT: addi.w $a1, $a2, 0 ++; LA64D-NEXT: bne $a3, $a1, .LBB35_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, float 1.0 monotonic, align 4 ++ ret float %v ++} ++ ++define double @double_fadd_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: double_fadd_monotonic: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -64 ++; LA64F-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB36_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $zero ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB36_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s3, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; 
LA64F-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 64 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fadd_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -64 ++; LA64D-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB36_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $zero ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB36_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 64 ++; LA64D-NEXT: ret ++ %v = atomicrmw fadd ptr %p, double 1.0 monotonic, align 4 ++ ret double %v ++} ++ ++define double @double_fsub_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: double_fsub_monotonic: 
++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -64 ++; LA64F-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, -1025 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB37_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(__adddf3) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $zero ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB37_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s3, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 64 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fsub_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -64 ++; LA64D-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 24 # 
8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) ++; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_0) ++; LA64D-NEXT: fld.d $fs0, $a0, 0 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB37_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fadd.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $zero ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB37_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 64 ++; LA64D-NEXT: ret ++ %v = atomicrmw fsub ptr %p, double 1.0 monotonic, align 4 ++ ret double %v ++} ++ ++define double @double_fmin_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: double_fmin_monotonic: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -64 ++; LA64F-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d 
$a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB38_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmin) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, $s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $zero ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB38_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s3, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 64 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmin_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -64 ++; LA64D-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB38_1: # %atomicrmw.start ++; LA64D-NEXT: 
# =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmin.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $zero ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB38_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 64 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmin ptr %p, double 1.0 monotonic, align 4 ++ ret double %v ++} ++ ++define double @double_fmax_monotonic(ptr %p) nounwind { ++; LA64F-LABEL: double_fmax_monotonic: ++; LA64F: # %bb.0: ++; LA64F-NEXT: addi.d $sp, $sp, -64 ++; LA64F-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64F-NEXT: st.d $s3, $sp, 16 # 8-byte Folded Spill ++; LA64F-NEXT: move $fp, $a0 ++; LA64F-NEXT: ld.d $a0, $a0, 0 ++; LA64F-NEXT: lu52i.d $s0, $zero, 1023 ++; LA64F-NEXT: ori $s1, $zero, 8 ++; LA64F-NEXT: addi.d $s2, $sp, 8 ++; LA64F-NEXT: addi.d $s3, $sp, 0 ++; LA64F-NEXT: .p2align 4, , 16 ++; LA64F-NEXT: .LBB39_1: # %atomicrmw.start ++; LA64F-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64F-NEXT: st.d $a0, $sp, 8 ++; LA64F-NEXT: move $a1, $s0 ++; LA64F-NEXT: bl %plt(fmax) ++; LA64F-NEXT: st.d $a0, $sp, 0 ++; LA64F-NEXT: move $a0, 
$s1 ++; LA64F-NEXT: move $a1, $fp ++; LA64F-NEXT: move $a2, $s2 ++; LA64F-NEXT: move $a3, $s3 ++; LA64F-NEXT: move $a4, $zero ++; LA64F-NEXT: move $a5, $zero ++; LA64F-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64F-NEXT: move $a1, $a0 ++; LA64F-NEXT: ld.d $a0, $sp, 8 ++; LA64F-NEXT: beqz $a1, .LBB39_1 ++; LA64F-NEXT: # %bb.2: # %atomicrmw.end ++; LA64F-NEXT: ld.d $s3, $sp, 16 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64F-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64F-NEXT: addi.d $sp, $sp, 64 ++; LA64F-NEXT: ret ++; ++; LA64D-LABEL: double_fmax_monotonic: ++; LA64D: # %bb.0: ++; LA64D-NEXT: addi.d $sp, $sp, -64 ++; LA64D-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s0, $sp, 40 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s1, $sp, 32 # 8-byte Folded Spill ++; LA64D-NEXT: st.d $s2, $sp, 24 # 8-byte Folded Spill ++; LA64D-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill ++; LA64D-NEXT: move $fp, $a0 ++; LA64D-NEXT: fld.d $fa0, $a0, 0 ++; LA64D-NEXT: addi.d $a0, $zero, 1 ++; LA64D-NEXT: movgr2fr.d $fa1, $a0 ++; LA64D-NEXT: ffint.d.l $fs0, $fa1 ++; LA64D-NEXT: ori $s0, $zero, 8 ++; LA64D-NEXT: addi.d $s1, $sp, 8 ++; LA64D-NEXT: addi.d $s2, $sp, 0 ++; LA64D-NEXT: .p2align 4, , 16 ++; LA64D-NEXT: .LBB39_1: # %atomicrmw.start ++; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 ++; LA64D-NEXT: fst.d $fa0, $sp, 8 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fa0 ++; LA64D-NEXT: fmax.d $fa0, $fa0, $fs0 ++; LA64D-NEXT: fst.d $fa0, $sp, 0 ++; LA64D-NEXT: move $a0, $s0 ++; LA64D-NEXT: move $a1, $fp ++; LA64D-NEXT: move $a2, $s1 ++; LA64D-NEXT: move $a3, $s2 ++; LA64D-NEXT: move $a4, $zero ++; LA64D-NEXT: move $a5, $zero ++; LA64D-NEXT: bl %plt(__atomic_compare_exchange) ++; LA64D-NEXT: 
fld.d $fa0, $sp, 8 ++; LA64D-NEXT: beqz $a0, .LBB39_1 ++; LA64D-NEXT: # %bb.2: # %atomicrmw.end ++; LA64D-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s2, $sp, 24 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s1, $sp, 32 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $s0, $sp, 40 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload ++; LA64D-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload ++; LA64D-NEXT: addi.d $sp, $sp, 64 ++; LA64D-NEXT: ret ++ %v = atomicrmw fmax ptr %p, double 1.0 monotonic, align 4 ++ ret double %v ++} +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll +index 26ba77e8d4fd..770358a05bfd 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll +@@ -353,3 +353,1403 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind { + %1 = atomicrmw min ptr %a, i64 %b acquire + ret i64 %1 + } ++ ++define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB16_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB16_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB16_3: # in Loop: Header=BB16_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB16_1 ++; LA64-NEXT: # %bb.4: ++; 
LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umax_i16_release(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB17_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB17_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB17_3: # in Loop: Header=BB17_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB17_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umax_i32_release(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: 
slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB20_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB20_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB20_3: # in Loop: Header=BB20_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB20_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umin_i16_release(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB21_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB21_3: # in Loop: Header=BB21_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB21_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umin_i32_release(ptr %a, i32 %b) 
nounwind { ++; LA64-LABEL: atomicrmw_umin_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a7, $a1, .LBB24_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB24_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB24_3: # in Loop: Header=BB24_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB24_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 
48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB25_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a7, $a1, .LBB25_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB25_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB25_3: # in Loop: Header=BB25_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB25_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_max_i32_release(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; 
LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a1, $a7, .LBB28_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB28_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB28_3: # in Loop: Header=BB28_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB28_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a1, $a7, .LBB29_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB29_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB29_3: # in Loop: Header=BB29_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB29_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 
@atomicrmw_min_i32_release(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB32_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB32_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB32_3: # in Loop: Header=BB32_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB32_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umax_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: 
bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB33_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB33_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB33_3: # in Loop: Header=BB33_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB33_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umax_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB36_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB36_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB36_1 Depth=1 ++; LA64-NEXT: xor 
$a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB36_3: # in Loop: Header=BB36_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB36_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umin_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB37_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB37_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB37_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB37_3: # in Loop: Header=BB37_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB37_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umin_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i64 %b acq_rel 
++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB40_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a7, $a1, .LBB40_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB40_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB40_3: # in Loop: Header=BB40_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB40_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB41_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, 
$a3 ++; LA64-NEXT: bge $a7, $a1, .LBB41_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB41_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB41_3: # in Loop: Header=BB41_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB41_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_max_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB44_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a1, $a7, .LBB44_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB44_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB44_3: # in Loop: 
Header=BB44_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB44_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a1, $a7, .LBB45_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB45_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB45_3: # in Loop: Header=BB45_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB45_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_min_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = 
atomicrmw min ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB48_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB48_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB48_3: # in Loop: Header=BB48_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB48_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umax_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB49_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB49_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB49_3: # in Loop: Header=BB49_1 
Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB49_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB52_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB52_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB52_3: # in Loop: Header=BB52_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB52_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umin_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: 
addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB53_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB53_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB53_3: # in Loop: Header=BB53_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB53_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; 
LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a7, $a1, .LBB56_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB56_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB56_3: # in Loop: Header=BB56_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB56_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a7, $a1, .LBB57_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB57_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB57_3: # in Loop: Header=BB57_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB57_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 
@atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a1, $a7, .LBB60_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB60_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB60_3: # in Loop: Header=BB60_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB60_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, 
$a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a1, $a7, .LBB61_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB61_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB61_3: # in Loop: Header=BB61_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB61_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i8_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, 
$a1, 0 ++; LA64-NEXT: .LBB64_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB64_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB64_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB64_3: # in Loop: Header=BB64_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB64_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i8 %b monotonic ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umax_i16_monotonic(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i16_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB65_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a6, $a1, .LBB65_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB65_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB65_3: # in Loop: Header=BB65_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB65_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i16 %b monotonic ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umax_i32_monotonic(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i32_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i32 %b 
monotonic ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umax_i64_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umax ptr %a, i64 %b monotonic ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i8_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB68_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB68_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB68_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB68_3: # in Loop: Header=BB68_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB68_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i8 %b monotonic ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_umin_i16_monotonic(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i16_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB69_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a6, $a4, $a3 ++; 
LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: bgeu $a1, $a6, .LBB69_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB69_1 Depth=1 ++; LA64-NEXT: xor $a5, $a4, $a1 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: .LBB69_3: # in Loop: Header=BB69_1 Depth=1 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB69_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i16 %b monotonic ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_umin_i32_monotonic(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i32_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.wu $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i32 %b monotonic ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_umin_i64_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.du $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw umin ptr %a, i64 %b monotonic ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i8_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB72_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a7, $a1, .LBB72_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB72_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 
++; LA64-NEXT: .LBB72_3: # in Loop: Header=BB72_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB72_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i8 %b monotonic ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i16_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB73_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a7, $a1, .LBB73_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB73_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB73_3: # in Loop: Header=BB73_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB73_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i16 %b monotonic ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_max_i32_monotonic(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i32_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i32 %b monotonic ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_max_i64_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammax_db.d $a2, $a1, $a0 ++; 
LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw max ptr %a, i64 %b monotonic ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i8_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: ext.w.b $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: andi $a4, $a0, 24 ++; LA64-NEXT: xori $a4, $a4, 56 ++; LA64-NEXT: .LBB76_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, $a3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a4 ++; LA64-NEXT: sra.w $a7, $a7, $a4 ++; LA64-NEXT: bge $a1, $a7, .LBB76_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB76_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a3 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB76_3: # in Loop: Header=BB76_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB76_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i8 %b monotonic ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i16_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a3, $a0, 24 ++; LA64-NEXT: ori $a4, $zero, 48 ++; LA64-NEXT: sub.d $a3, $a4, $a3 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a4, $a4, $a0 ++; LA64-NEXT: addi.w $a4, $a4, 0 ++; LA64-NEXT: ext.w.h $a1, $a1 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB77_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a5, $a2, 0 ++; LA64-NEXT: and $a7, $a5, 
$a4 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sll.w $a7, $a7, $a3 ++; LA64-NEXT: sra.w $a7, $a7, $a3 ++; LA64-NEXT: bge $a1, $a7, .LBB77_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB77_1 Depth=1 ++; LA64-NEXT: xor $a6, $a5, $a1 ++; LA64-NEXT: and $a6, $a6, $a4 ++; LA64-NEXT: xor $a6, $a5, $a6 ++; LA64-NEXT: .LBB77_3: # in Loop: Header=BB77_1 Depth=1 ++; LA64-NEXT: sc.w $a6, $a2, 0 ++; LA64-NEXT: beqz $a6, .LBB77_1 ++; LA64-NEXT: # %bb.4: ++; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i16 %b monotonic ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_min_i32_monotonic(ptr %a, i32 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i32_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i32 %b monotonic ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind { ++; LA64-LABEL: atomicrmw_min_i64_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: ammin_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw min ptr %a, i64 %b monotonic ++ ret i64 %1 ++} +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll +index 626276ba05f7..94a26e4ed9c7 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll +@@ -900,6 +900,3228 @@ define i64 @atomicrmw_xor_i64_acquire(ptr %a, i64 %b) nounwind { + ret i64 %1 + } + ++define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w 
$a4, $a2, 0 ++; LA32-NEXT: addi.w $a5, $a1, 0 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB28_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: addi.w $a5, $a1, 0 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB28_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i8 @atomicrmw_xchg_0_i8_release(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_0_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a2, $zero, 255 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $zero, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB29_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_0_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a2, $zero, 255 ++; LA64-NEXT: sll.w 
$a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $zero, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB29_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 0 release ++ ret i8 %1 ++} ++ ++define i8 @atomicrmw_xchg_minus_1_i8_release(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_minus_1_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a2, $zero, 255 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB30_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_minus_1_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a2, $zero, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB30_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 -1 release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_xchg_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i16_release: ++; 
LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: addi.w $a5, $a1, 0 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB31_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB31_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: addi.w $a5, $a1, 0 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB31_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i16 @atomicrmw_xchg_0_i16_release(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_0_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $zero, 0 ++; LA32-NEXT: xor 
$a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB32_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_0_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $zero, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB32_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 0 release ++ ret i16 %1 ++} ++ ++define i16 @atomicrmw_xchg_minus_1_i16_release(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_minus_1_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB33_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_minus_1_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; 
LA64-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB33_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 -1 release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_xchg_i32_release(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: move $a3, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB34_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amswap_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_xchg_i64_release(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i64_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_exchange_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amswap_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi 
$a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB36_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: add.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB36_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB36_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: add.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB36_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_add_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB37_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: add.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB37_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; 
LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB37_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: add.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB37_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_add_i32_release(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB38_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: add.w $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB38_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amadd_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_add_i64_release(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i64_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_fetch_add_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amadd_db.d $a2, $a1, $a0 ++; 
LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB40_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: sub.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB40_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB40_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: sub.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB40_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_sub_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; 
LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB41_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: sub.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB41_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB41_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: sub.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB41_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_sub_i32_release(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB42_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: sub.w $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB42_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: sub.w $a2, $zero, $a1 ++; LA64-NEXT: amadd_db.w $a1, $a2, $a0 ++; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_sub_i64_release(ptr %a, i64 
%b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i64_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_fetch_sub_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: sub.d $a2, $zero, $a1 ++; LA64-NEXT: amadd_db.d $a1, $a2, $a0 ++; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB44_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: and $a5, $a4, $a1 ++; LA32-NEXT: nor $a5, $a5, $zero ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB44_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB44_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a5, $a4, $a1 ++; LA64-NEXT: nor $a5, $a5, $zero ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; 
LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB44_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_nand_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: and $a5, $a4, $a1 ++; LA32-NEXT: nor $a5, $a5, $zero ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB45_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a5, $a4, $a1 ++; LA64-NEXT: nor $a5, $a5, $zero ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB45_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_nand_i32_release(ptr %a, i32 %b) 
nounwind { ++; LA32-LABEL: atomicrmw_nand_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a3, $a2, $a1 ++; LA32-NEXT: nor $a3, $a3, $zero ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB46_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a2, $a0, 0 ++; LA64-NEXT: and $a3, $a2, $a1 ++; LA64-NEXT: nor $a3, $a3, $zero ++; LA64-NEXT: sc.w $a3, $a0, 0 ++; LA64-NEXT: beqz $a3, .LBB46_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_nand_i64_release(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i64_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_fetch_nand_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB47_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a2, $a0, 0 ++; LA64-NEXT: and $a3, $a2, $a1 ++; LA64-NEXT: nor $a3, $a3, $zero ++; LA64-NEXT: sc.d $a3, $a0, 0 ++; LA64-NEXT: beqz $a3, .LBB47_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: slli.w $a2, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a2 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a2 ++; LA32-NEXT: orn $a1, $a1, $a3 ++; LA32-NEXT: 
addi.w $a3, $zero, -4 ++; LA32-NEXT: and $a0, $a0, $a3 ++; LA32-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a0, 0 ++; LA32-NEXT: and $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a0, 0 ++; LA32-NEXT: beqz $a4, .LBB48_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: slli.d $a2, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a2 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a2 ++; LA64-NEXT: orn $a1, $a1, $a3 ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a0, $a0, $a3 ++; LA64-NEXT: amand_db.w $a3, $a1, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: slli.w $a3, $a0, 3 ++; LA32-NEXT: sll.w $a2, $a2, $a3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a3 ++; LA32-NEXT: orn $a1, $a1, $a2 ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a0, $a0, $a2 ++; LA32-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a4, $a2, $a1 ++; LA32-NEXT: sc.w $a4, $a0, 0 ++; LA32-NEXT: beqz $a4, .LBB49_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a2, $a3 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: slli.d $a3, $a0, 3 ++; LA64-NEXT: sll.w $a2, $a2, $a3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a3 ++; LA64-NEXT: orn $a1, $a1, $a2 ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a0, $a0, $a2 ++; LA64-NEXT: amand_db.w $a2, $a1, $a0 ++; LA64-NEXT: srl.w $a0, $a2, $a3 ++; 
LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_and_i32_release(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB50_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amand_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_and_i64_release(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i64_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_fetch_and_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amand_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_or_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: or $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB52_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; 
LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_or_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: or $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB53_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_or_i32_release(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: or $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB54_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amor_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_or_i64_release(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i64_release: ++; LA32: # 
%bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_fetch_or_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amor_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_xor_i8_release(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i8_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB56_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB56_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i8_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amxor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i8 %b release ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_xor_i16_release(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i16_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB57_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB57_1 ++; LA32-NEXT: # %bb.2: ++; 
LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i16_release: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amxor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i16 %b release ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_xor_i32_release(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i32_release: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB58_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: xor $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB58_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i32_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amxor_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i32 %b release ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_xor_i64_release(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i64_release: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 3 ++; LA32-NEXT: bl %plt(__atomic_fetch_xor_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i64_release: ++; LA64: # %bb.0: ++; LA64-NEXT: amxor_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i64 %b release ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 
++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: addi.w $a5, $a1, 0 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB60_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB60_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: addi.w $a5, $a1, 0 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB60_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i8 @atomicrmw_xchg_0_i8_acq_rel(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_0_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a2, $zero, 255 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $zero, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB61_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_0_i8_acq_rel: ++; LA64: # %bb.0: ++; 
LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a2, $zero, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB61_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $zero, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB61_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 0 acq_rel ++ ret i8 %1 ++} ++ ++define i8 @atomicrmw_xchg_minus_1_i8_acq_rel(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a2, $zero, 255 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB62_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_minus_1_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a2, $zero, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB62_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB62_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr 
%a, i8 -1 acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_xchg_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: addi.w $a5, $a1, 0 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB63_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB63_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: addi.w $a5, $a1, 0 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB63_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i16 @atomicrmw_xchg_0_i16_acq_rel(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_0_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: sll.w $a2, $a2, 
$a0 ++; LA32-NEXT: .LBB64_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $zero, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB64_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_0_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB64_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $zero, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB64_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 0 acq_rel ++ ret i16 %1 ++} ++ ++define i16 @atomicrmw_xchg_minus_1_i16_acq_rel(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB65_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB65_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_minus_1_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: 
slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB65_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB65_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 -1 acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_xchg_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB66_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: move $a3, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB66_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amswap_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_xchg_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_exchange_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amswap_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 
++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB68_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: add.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB68_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB68_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: add.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB68_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_add_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB69_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: add.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; 
LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB69_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB69_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: add.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB69_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_add_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB70_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: add.w $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB70_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amadd_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_add_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_fetch_add_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; 
LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amadd_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB72_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: sub.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB72_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB72_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: sub.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB72_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_sub_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; 
LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB73_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: sub.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB73_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB73_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: sub.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB73_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_sub_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB74_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: sub.w $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB74_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: sub.w $a2, $zero, $a1 ++; LA64-NEXT: amadd_db.w $a1, $a2, 
$a0 ++; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_sub_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_fetch_sub_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: sub.d $a2, $zero, $a1 ++; LA64-NEXT: amadd_db.d $a1, $a2, $a0 ++; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB76_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: and $a5, $a4, $a1 ++; LA32-NEXT: nor $a5, $a5, $zero ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB76_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB76_1: # =>This Inner Loop Header: Depth=1 ++; 
LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a5, $a4, $a1 ++; LA64-NEXT: nor $a5, $a5, $zero ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB76_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_nand_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB77_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: and $a5, $a4, $a1 ++; LA32-NEXT: nor $a5, $a5, $zero ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB77_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB77_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a5, $a4, $a1 ++; LA64-NEXT: nor $a5, $a5, $zero ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB77_1 ++; LA64-NEXT: # %bb.2: ++; 
LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_nand_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB78_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a3, $a2, $a1 ++; LA32-NEXT: nor $a3, $a3, $zero ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB78_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB78_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a2, $a0, 0 ++; LA64-NEXT: and $a3, $a2, $a1 ++; LA64-NEXT: nor $a3, $a3, $zero ++; LA64-NEXT: sc.w $a3, $a0, 0 ++; LA64-NEXT: beqz $a3, .LBB78_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_nand_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_fetch_nand_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB79_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a2, $a0, 0 ++; LA64-NEXT: and $a3, $a2, $a1 ++; LA64-NEXT: nor $a3, $a3, $zero ++; LA64-NEXT: sc.d $a3, $a0, 0 ++; LA64-NEXT: beqz $a3, .LBB79_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: slli.w $a2, $a0, 3 ++; LA32-NEXT: 
ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a2 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a2 ++; LA32-NEXT: orn $a1, $a1, $a3 ++; LA32-NEXT: addi.w $a3, $zero, -4 ++; LA32-NEXT: and $a0, $a0, $a3 ++; LA32-NEXT: .LBB80_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a0, 0 ++; LA32-NEXT: and $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a0, 0 ++; LA32-NEXT: beqz $a4, .LBB80_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: slli.d $a2, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a2 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a2 ++; LA64-NEXT: orn $a1, $a1, $a3 ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a0, $a0, $a3 ++; LA64-NEXT: amand_db.w $a3, $a1, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: slli.w $a3, $a0, 3 ++; LA32-NEXT: sll.w $a2, $a2, $a3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a3 ++; LA32-NEXT: orn $a1, $a1, $a2 ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a0, $a0, $a2 ++; LA32-NEXT: .LBB81_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a4, $a2, $a1 ++; LA32-NEXT: sc.w $a4, $a0, 0 ++; LA32-NEXT: beqz $a4, .LBB81_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a2, $a3 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: slli.d $a3, $a0, 3 ++; LA64-NEXT: sll.w $a2, $a2, $a3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a3 ++; 
LA64-NEXT: orn $a1, $a1, $a2 ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a0, $a0, $a2 ++; LA64-NEXT: amand_db.w $a2, $a1, $a0 ++; LA64-NEXT: srl.w $a0, $a2, $a3 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_and_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB82_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB82_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amand_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_and_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_fetch_and_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amand_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_or_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB84_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: or $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB84_1 ++; 
LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_or_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB85_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: or $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB85_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_or_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB86_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: or $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB86_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amor_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = 
atomicrmw or ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_or_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_fetch_or_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amor_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_xor_i8_acq_rel(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i8_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB88_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB88_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i8_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amxor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i8 %b acq_rel ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_xor_i16_acq_rel(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i16_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB89_1: # =>This Inner Loop 
Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB89_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i16_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amxor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i16 %b acq_rel ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_xor_i32_acq_rel(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i32_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB90_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: xor $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB90_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i32_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amxor_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i32 %b acq_rel ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_xor_i64_acq_rel(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i64_acq_rel: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 4 ++; LA32-NEXT: bl %plt(__atomic_fetch_xor_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i64_acq_rel: ++; LA64: # %bb.0: ++; LA64-NEXT: amxor_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i64 %b acq_rel ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i8_seq_cst: ++; LA32: # %bb.0: 
++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB92_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: addi.w $a5, $a1, 0 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB92_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB92_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: addi.w $a5, $a1, 0 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB92_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i8 @atomicrmw_xchg_0_i8_seq_cst(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_0_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a2, $zero, 255 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB93_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $zero, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: 
beqz $a4, .LBB93_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_0_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a2, $zero, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB93_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $zero, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB93_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 0 seq_cst ++ ret i8 %1 ++} ++ ++define i8 @atomicrmw_xchg_minus_1_i8_seq_cst(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a2, $zero, 255 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB94_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB94_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_minus_1_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a2, $zero, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB94_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 
++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB94_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i8 -1 seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_xchg_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB95_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: addi.w $a5, $a1, 0 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB95_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB95_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: addi.w $a5, $a1, 0 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB95_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i16 @atomicrmw_xchg_0_i16_seq_cst(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_0_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, 
$zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB96_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $zero, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB96_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_0_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB96_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $zero, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB96_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 0 seq_cst ++ ret i16 %1 ++} ++ ++define i16 @atomicrmw_xchg_minus_1_i16_seq_cst(ptr %a) nounwind { ++; LA32-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a1, $zero, -4 ++; LA32-NEXT: and $a1, $a0, $a1 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: sll.w $a2, $a2, $a0 ++; LA32-NEXT: .LBB97_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a1, 0 ++; LA32-NEXT: addi.w $a4, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: and $a4, $a4, $a2 ++; LA32-NEXT: xor $a4, $a3, $a4 ++; LA32-NEXT: sc.w $a4, $a1, 0 ++; LA32-NEXT: beqz $a4, .LBB97_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; 
LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_minus_1_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $zero, -4 ++; LA64-NEXT: and $a1, $a0, $a1 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: .LBB97_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a1, 0 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: and $a4, $a4, $a2 ++; LA64-NEXT: xor $a4, $a3, $a4 ++; LA64-NEXT: sc.w $a4, $a1, 0 ++; LA64-NEXT: beqz $a4, .LBB97_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i16 -1 seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_xchg_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB98_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: move $a3, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB98_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amswap_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_xchg_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_xchg_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_exchange_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xchg_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amswap_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xchg ptr %a, i64 %b seq_cst ++ ret 
i64 %1 ++} ++ ++define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB100_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: add.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB100_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB100_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: add.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB100_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_add_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: 
.LBB101_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: add.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB101_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB101_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: add.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB101_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_add_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB102_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: add.w $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB102_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amadd_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_add_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_add_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; 
LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_fetch_add_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_add_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amadd_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw add ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB104_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: sub.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB104_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB104_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: sub.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB104_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i8 %b seq_cst ++ ret 
i8 %1 ++} ++ ++define i16 @atomicrmw_sub_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB105_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: sub.w $a5, $a4, $a1 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB105_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB105_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: sub.w $a5, $a4, $a1 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB105_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_sub_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB106_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: sub.w $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB106_1 ++; LA32-NEXT: # %bb.2: 
++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: sub.w $a2, $zero, $a1 ++; LA64-NEXT: amadd_db.w $a1, $a2, $a0 ++; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_sub_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_sub_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_fetch_sub_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_sub_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: sub.d $a2, $zero, $a1 ++; LA64-NEXT: amadd_db.d $a1, $a2, $a0 ++; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: ret ++ %1 = atomicrmw sub ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB108_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: and $a5, $a4, $a1 ++; LA32-NEXT: nor $a5, $a5, $zero ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB108_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; 
LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB108_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a5, $a4, $a1 ++; LA64-NEXT: nor $a5, $a5, $zero ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB108_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: lu12i.w $a3, 15 ++; LA32-NEXT: ori $a3, $a3, 4095 ++; LA32-NEXT: sll.w $a3, $a3, $a0 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB109_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a4, $a2, 0 ++; LA32-NEXT: and $a5, $a4, $a1 ++; LA32-NEXT: nor $a5, $a5, $zero ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: and $a5, $a5, $a3 ++; LA32-NEXT: xor $a5, $a4, $a5 ++; LA32-NEXT: sc.w $a5, $a2, 0 ++; LA32-NEXT: beqz $a5, .LBB109_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a4, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: lu12i.w $a3, 15 ++; LA64-NEXT: ori $a3, $a3, 4095 ++; LA64-NEXT: sll.w $a3, $a3, $a0 ++; LA64-NEXT: addi.w $a3, $a3, 0 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB109_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a2, 0 ++; LA64-NEXT: and $a5, $a4, $a1 ++; LA64-NEXT: nor $a5, $a5, $zero 
++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: and $a5, $a5, $a3 ++; LA64-NEXT: xor $a5, $a4, $a5 ++; LA64-NEXT: sc.w $a5, $a2, 0 ++; LA64-NEXT: beqz $a5, .LBB109_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: srl.w $a0, $a4, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_nand_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a3, $a2, $a1 ++; LA32-NEXT: nor $a3, $a3, $zero ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB110_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB110_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a2, $a0, 0 ++; LA64-NEXT: and $a3, $a2, $a1 ++; LA64-NEXT: nor $a3, $a3, $zero ++; LA64-NEXT: sc.w $a3, $a0, 0 ++; LA64-NEXT: beqz $a3, .LBB110_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_nand_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_nand_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_fetch_nand_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_nand_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB111_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a2, $a0, 0 ++; LA64-NEXT: and $a3, $a2, $a1 ++; LA64-NEXT: nor $a3, $a3, $zero ++; LA64-NEXT: sc.d $a3, $a0, 0 ++; LA64-NEXT: beqz $a3, .LBB111_1 ++; LA64-NEXT: # %bb.2: ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw nand ptr %a, i64 %b 
seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: slli.w $a2, $a0, 3 ++; LA32-NEXT: ori $a3, $zero, 255 ++; LA32-NEXT: sll.w $a3, $a3, $a2 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a2 ++; LA32-NEXT: orn $a1, $a1, $a3 ++; LA32-NEXT: addi.w $a3, $zero, -4 ++; LA32-NEXT: and $a0, $a0, $a3 ++; LA32-NEXT: .LBB112_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a0, 0 ++; LA32-NEXT: and $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a0, 0 ++; LA32-NEXT: beqz $a4, .LBB112_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: slli.d $a2, $a0, 3 ++; LA64-NEXT: ori $a3, $zero, 255 ++; LA64-NEXT: sll.w $a3, $a3, $a2 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a2 ++; LA64-NEXT: orn $a1, $a1, $a3 ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a0, $a0, $a3 ++; LA64-NEXT: amand_db.w $a3, $a1, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: lu12i.w $a2, 15 ++; LA32-NEXT: ori $a2, $a2, 4095 ++; LA32-NEXT: slli.w $a3, $a0, 3 ++; LA32-NEXT: sll.w $a2, $a2, $a3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a3 ++; LA32-NEXT: orn $a1, $a1, $a2 ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a0, $a0, $a2 ++; LA32-NEXT: .LBB113_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a4, $a2, $a1 ++; LA32-NEXT: sc.w $a4, $a0, 0 ++; LA32-NEXT: beqz $a4, .LBB113_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a2, $a3 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: 
lu12i.w $a2, 15 ++; LA64-NEXT: ori $a2, $a2, 4095 ++; LA64-NEXT: slli.d $a3, $a0, 3 ++; LA64-NEXT: sll.w $a2, $a2, $a3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a3 ++; LA64-NEXT: orn $a1, $a1, $a2 ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a0, $a0, $a2 ++; LA64-NEXT: amand_db.w $a2, $a1, $a0 ++; LA64-NEXT: srl.w $a0, $a2, $a3 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_and_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB114_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: and $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB114_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amand_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_and_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_and_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_fetch_and_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_and_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amand_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw and ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_or_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 
++; LA32-NEXT: .LBB116_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: or $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB116_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_or_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB117_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: or $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB117_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_or_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB118_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: or $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB118_1 ++; LA32-NEXT: # %bb.2: ++; 
LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amor_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_or_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_or_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_fetch_or_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_or_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amor_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw or ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ ++define i8 @atomicrmw_xor_i8_seq_cst(ptr %a, i8 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i8_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: andi $a1, $a1, 255 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB120_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB120_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i8_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amxor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i8 %b seq_cst ++ ret i8 %1 ++} ++ ++define i16 @atomicrmw_xor_i16_seq_cst(ptr %a, i16 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i16_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w 
$a2, $zero, -4 ++; LA32-NEXT: and $a2, $a0, $a2 ++; LA32-NEXT: slli.w $a0, $a0, 3 ++; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 ++; LA32-NEXT: sll.w $a1, $a1, $a0 ++; LA32-NEXT: .LBB121_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a3, $a2, 0 ++; LA32-NEXT: xor $a4, $a3, $a1 ++; LA32-NEXT: sc.w $a4, $a2, 0 ++; LA32-NEXT: beqz $a4, .LBB121_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: srl.w $a0, $a3, $a0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i16_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a2, $zero, -4 ++; LA64-NEXT: and $a2, $a0, $a2 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: amxor_db.w $a3, $a1, $a2 ++; LA64-NEXT: srl.w $a0, $a3, $a0 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i16 %b seq_cst ++ ret i16 %1 ++} ++ ++define i32 @atomicrmw_xor_i32_seq_cst(ptr %a, i32 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i32_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: .LBB122_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: ll.w $a2, $a0, 0 ++; LA32-NEXT: xor $a3, $a2, $a1 ++; LA32-NEXT: sc.w $a3, $a0, 0 ++; LA32-NEXT: beqz $a3, .LBB122_1 ++; LA32-NEXT: # %bb.2: ++; LA32-NEXT: move $a0, $a2 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i32_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amxor_db.w $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i32 %b seq_cst ++ ret i32 %1 ++} ++ ++define i64 @atomicrmw_xor_i64_seq_cst(ptr %a, i64 %b) nounwind { ++; LA32-LABEL: atomicrmw_xor_i64_seq_cst: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: ori $a3, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_fetch_xor_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: atomicrmw_xor_i64_seq_cst: ++; LA64: # %bb.0: ++; LA64-NEXT: amxor_db.d $a2, $a1, $a0 ++; LA64-NEXT: move $a0, $a2 ++; 
LA64-NEXT: ret ++ %1 = atomicrmw xor ptr %a, i64 %b seq_cst ++ ret i64 %1 ++} ++ + define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-LABEL: atomicrmw_xchg_i8_monotonic: + ; LA32: # %bb.0: +@@ -910,14 +4132,14 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB124_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: addi.w $a5, $a1, 0 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB28_1 ++; LA32-NEXT: beqz $a5, .LBB124_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -933,14 +4155,14 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: andi $a1, $a1, 255 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB28_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB124_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: addi.w $a5, $a1, 0 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB28_1 ++; LA64-NEXT: beqz $a5, .LBB124_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -959,14 +4181,14 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB125_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: addi.w $a5, $a1, 0 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: and $a5, $a5, $a3 + ; 
LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB29_1 ++; LA32-NEXT: beqz $a5, .LBB125_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -983,14 +4205,14 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB29_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB125_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: addi.w $a5, $a1, 0 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB29_1 ++; LA64-NEXT: beqz $a5, .LBB125_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1001,11 +4223,11 @@ define i16 @atomicrmw_xchg_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_xchg_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_xchg_i32_monotonic: + ; LA32: # %bb.0: +-; LA32-NEXT: .LBB30_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB126_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: move $a3, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB30_1 ++; LA32-NEXT: beqz $a3, .LBB126_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret +@@ -1049,14 +4271,14 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB128_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: add.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz 
$a5, .LBB32_1 ++; LA32-NEXT: beqz $a5, .LBB128_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -1072,14 +4294,14 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: andi $a1, $a1, 255 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB32_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB128_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: add.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB32_1 ++; LA64-NEXT: beqz $a5, .LBB128_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1098,14 +4320,14 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB129_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: add.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB33_1 ++; LA32-NEXT: beqz $a5, .LBB129_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -1122,14 +4344,14 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB33_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB129_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: add.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: 
beqz $a5, .LBB33_1 ++; LA64-NEXT: beqz $a5, .LBB129_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1140,11 +4362,11 @@ define i16 @atomicrmw_add_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_add_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_add_i32_monotonic: + ; LA32: # %bb.0: +-; LA32-NEXT: .LBB34_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB130_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: add.w $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB34_1 ++; LA32-NEXT: beqz $a3, .LBB130_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret +@@ -1188,14 +4410,14 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB36_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB132_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: sub.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB36_1 ++; LA32-NEXT: beqz $a5, .LBB132_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -1211,14 +4433,14 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: andi $a1, $a1, 255 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB36_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB132_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: sub.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB36_1 ++; LA64-NEXT: beqz $a5, .LBB132_1 + ; LA64-NEXT: # %bb.2: + ; 
LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1237,14 +4459,14 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB37_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB133_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: sub.w $a5, $a4, $a1 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB37_1 ++; LA32-NEXT: beqz $a5, .LBB133_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -1261,14 +4483,14 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB37_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB133_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: sub.w $a5, $a4, $a1 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB37_1 ++; LA64-NEXT: beqz $a5, .LBB133_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1279,11 +4501,11 @@ define i16 @atomicrmw_sub_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_sub_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_sub_i32_monotonic: + ; LA32: # %bb.0: +-; LA32-NEXT: .LBB38_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB134_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: sub.w $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB38_1 ++; LA32-NEXT: beqz $a3, .LBB134_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret +@@ -1329,7 
+4551,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB40_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB136_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: and $a5, $a4, $a1 + ; LA32-NEXT: nor $a5, $a5, $zero +@@ -1337,7 +4559,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB40_1 ++; LA32-NEXT: beqz $a5, .LBB136_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -1353,7 +4575,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: andi $a1, $a1, 255 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB40_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB136_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a5, $a4, $a1 + ; LA64-NEXT: nor $a5, $a5, $zero +@@ -1361,7 +4583,7 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB40_1 ++; LA64-NEXT: beqz $a5, .LBB136_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1380,7 +4602,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: sll.w $a3, $a3, $a0 + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB41_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB137_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a4, $a2, 0 + ; LA32-NEXT: and $a5, $a4, $a1 + ; LA32-NEXT: nor $a5, $a5, $zero +@@ -1388,7 +4610,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind { 
+ ; LA32-NEXT: and $a5, $a5, $a3 + ; LA32-NEXT: xor $a5, $a4, $a5 + ; LA32-NEXT: sc.w $a5, $a2, 0 +-; LA32-NEXT: beqz $a5, .LBB41_1 ++; LA32-NEXT: beqz $a5, .LBB137_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a4, $a0 + ; LA32-NEXT: ret +@@ -1405,7 +4627,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB41_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB137_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a2, 0 + ; LA64-NEXT: and $a5, $a4, $a1 + ; LA64-NEXT: nor $a5, $a5, $zero +@@ -1413,7 +4635,7 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA64-NEXT: and $a5, $a5, $a3 + ; LA64-NEXT: xor $a5, $a4, $a5 + ; LA64-NEXT: sc.w $a5, $a2, 0 +-; LA64-NEXT: beqz $a5, .LBB41_1 ++; LA64-NEXT: beqz $a5, .LBB137_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: srl.w $a0, $a4, $a0 + ; LA64-NEXT: ret +@@ -1424,24 +4646,24 @@ define i16 @atomicrmw_nand_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_nand_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_nand_i32_monotonic: + ; LA32: # %bb.0: +-; LA32-NEXT: .LBB42_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB138_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: and $a3, $a2, $a1 + ; LA32-NEXT: nor $a3, $a3, $zero + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB42_1 ++; LA32-NEXT: beqz $a3, .LBB138_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: atomicrmw_nand_i32_monotonic: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB42_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB138_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a2, $a0, 0 + ; LA64-NEXT: and $a3, $a2, $a1 + ; LA64-NEXT: nor $a3, $a3, $zero + ; LA64-NEXT: sc.w $a3, $a0, 0 +-; LA64-NEXT: beqz $a3, .LBB42_1 ++; 
LA64-NEXT: beqz $a3, .LBB138_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: move $a0, $a2 + ; LA64-NEXT: ret +@@ -1462,12 +4684,12 @@ define i64 @atomicrmw_nand_i64_monotonic(ptr %a, i64 %b) nounwind { + ; + ; LA64-LABEL: atomicrmw_nand_i64_monotonic: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB43_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB139_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.d $a2, $a0, 0 + ; LA64-NEXT: and $a3, $a2, $a1 + ; LA64-NEXT: nor $a3, $a3, $zero + ; LA64-NEXT: sc.d $a3, $a0, 0 +-; LA64-NEXT: beqz $a3, .LBB43_1 ++; LA64-NEXT: beqz $a3, .LBB139_1 + ; LA64-NEXT: # %bb.2: + ; LA64-NEXT: move $a0, $a2 + ; LA64-NEXT: ret +@@ -1486,11 +4708,11 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: orn $a1, $a1, $a3 + ; LA32-NEXT: addi.w $a3, $zero, -4 + ; LA32-NEXT: and $a0, $a0, $a3 +-; LA32-NEXT: .LBB44_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB140_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a3, $a0, 0 + ; LA32-NEXT: and $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a0, 0 +-; LA32-NEXT: beqz $a4, .LBB44_1 ++; LA32-NEXT: beqz $a4, .LBB140_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a3, $a2 + ; LA32-NEXT: ret +@@ -1524,11 +4746,11 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: orn $a1, $a1, $a2 + ; LA32-NEXT: addi.w $a2, $zero, -4 + ; LA32-NEXT: and $a0, $a0, $a2 +-; LA32-NEXT: .LBB45_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB141_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: and $a4, $a2, $a1 + ; LA32-NEXT: sc.w $a4, $a0, 0 +-; LA32-NEXT: beqz $a4, .LBB45_1 ++; LA32-NEXT: beqz $a4, .LBB141_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a2, $a3 + ; LA32-NEXT: ret +@@ -1554,11 +4776,11 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_and_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_and_i32_monotonic: + ; LA32: # 
%bb.0: +-; LA32-NEXT: .LBB46_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB142_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: and $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB46_1 ++; LA32-NEXT: beqz $a3, .LBB142_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret +@@ -1600,11 +4822,11 @@ define i8 @atomicrmw_or_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: slli.w $a0, $a0, 3 + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB48_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB144_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: or $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +-; LA32-NEXT: beqz $a4, .LBB48_1 ++; LA32-NEXT: beqz $a4, .LBB144_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a3, $a0 + ; LA32-NEXT: ret +@@ -1631,11 +4853,11 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: slli.w $a0, $a0, 3 + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB49_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB145_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: or $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +-; LA32-NEXT: beqz $a4, .LBB49_1 ++; LA32-NEXT: beqz $a4, .LBB145_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a3, $a0 + ; LA32-NEXT: ret +@@ -1657,11 +4879,11 @@ define i16 @atomicrmw_or_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_or_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_or_i32_monotonic: + ; LA32: # %bb.0: +-; LA32-NEXT: .LBB50_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB146_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: or $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB50_1 ++; LA32-NEXT: beqz $a3, 
.LBB146_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret +@@ -1703,11 +4925,11 @@ define i8 @atomicrmw_xor_i8_monotonic(ptr %a, i8 %b) nounwind { + ; LA32-NEXT: slli.w $a0, $a0, 3 + ; LA32-NEXT: andi $a1, $a1, 255 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB52_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB148_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: xor $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +-; LA32-NEXT: beqz $a4, .LBB52_1 ++; LA32-NEXT: beqz $a4, .LBB148_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a3, $a0 + ; LA32-NEXT: ret +@@ -1734,11 +4956,11 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind { + ; LA32-NEXT: slli.w $a0, $a0, 3 + ; LA32-NEXT: bstrpick.w $a1, $a1, 15, 0 + ; LA32-NEXT: sll.w $a1, $a1, $a0 +-; LA32-NEXT: .LBB53_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB149_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a3, $a2, 0 + ; LA32-NEXT: xor $a4, $a3, $a1 + ; LA32-NEXT: sc.w $a4, $a2, 0 +-; LA32-NEXT: beqz $a4, .LBB53_1 ++; LA32-NEXT: beqz $a4, .LBB149_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: srl.w $a0, $a3, $a0 + ; LA32-NEXT: ret +@@ -1760,11 +4982,11 @@ define i16 @atomicrmw_xor_i16_monotonic(ptr %a, i16 %b) nounwind { + define i32 @atomicrmw_xor_i32_monotonic(ptr %a, i32 %b) nounwind { + ; LA32-LABEL: atomicrmw_xor_i32_monotonic: + ; LA32: # %bb.0: +-; LA32-NEXT: .LBB54_1: # =>This Inner Loop Header: Depth=1 ++; LA32-NEXT: .LBB150_1: # =>This Inner Loop Header: Depth=1 + ; LA32-NEXT: ll.w $a2, $a0, 0 + ; LA32-NEXT: xor $a3, $a2, $a1 + ; LA32-NEXT: sc.w $a3, $a0, 0 +-; LA32-NEXT: beqz $a3, .LBB54_1 ++; LA32-NEXT: beqz $a3, .LBB150_1 + ; LA32-NEXT: # %bb.2: + ; LA32-NEXT: move $a0, $a2 + ; LA32-NEXT: ret +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll +new file mode 100644 +index 000000000000..8d6056bc7677 
+--- /dev/null ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll +@@ -0,0 +1,17 @@ ++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ++; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32 ++; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64 ++ ++define void @fence_singlethread() { ++; LA32-LABEL: fence_singlethread: ++; LA32: # %bb.0: ++; LA32-NEXT: dbar 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: fence_singlethread: ++; LA64: # %bb.0: ++; LA64-NEXT: dbar 0 ++; LA64-NEXT: ret ++ fence syncscope("singlethread") seq_cst ++ ret void ++} +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +index e91d0c145eab..deff11723d27 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +@@ -72,6 +72,202 @@ define i64 @load_acquire_i64(ptr %ptr) { + ret i64 %val + } + ++define i8 @load_unordered_i8(ptr %ptr) { ++; LA32-LABEL: load_unordered_i8: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.b $a0, $a0, 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_unordered_i8: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.b $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i8, ptr %ptr unordered, align 1 ++ ret i8 %val ++} ++ ++define i16 @load_unordered_i16(ptr %ptr) { ++; LA32-LABEL: load_unordered_i16: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.h $a0, $a0, 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_unordered_i16: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.h $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i16, ptr %ptr unordered, align 2 ++ ret i16 %val ++} ++ ++define i32 @load_unordered_i32(ptr %ptr) { ++; LA32-LABEL: load_unordered_i32: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.w $a0, $a0, 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_unordered_i32: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.w $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic 
i32, ptr %ptr unordered, align 4 ++ ret i32 %val ++} ++ ++define i64 @load_unordered_i64(ptr %ptr) { ++; LA32-LABEL: load_unordered_i64: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: .cfi_def_cfa_offset 16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: .cfi_offset 1, -4 ++; LA32-NEXT: move $a1, $zero ++; LA32-NEXT: bl %plt(__atomic_load_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_unordered_i64: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.d $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i64, ptr %ptr unordered, align 8 ++ ret i64 %val ++} ++ ++define i8 @load_monotonic_i8(ptr %ptr) { ++; LA32-LABEL: load_monotonic_i8: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.b $a0, $a0, 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_monotonic_i8: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.b $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i8, ptr %ptr monotonic, align 1 ++ ret i8 %val ++} ++ ++define i16 @load_monotonic_i16(ptr %ptr) { ++; LA32-LABEL: load_monotonic_i16: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.h $a0, $a0, 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_monotonic_i16: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.h $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i16, ptr %ptr monotonic, align 2 ++ ret i16 %val ++} ++ ++define i32 @load_monotonic_i32(ptr %ptr) { ++; LA32-LABEL: load_monotonic_i32: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.w $a0, $a0, 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_monotonic_i32: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.w $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i32, ptr %ptr monotonic, align 4 ++ ret i32 %val ++} ++ ++define i64 @load_monotonic_i64(ptr %ptr) { ++; LA32-LABEL: load_monotonic_i64: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: .cfi_def_cfa_offset 16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: .cfi_offset 1, -4 ++; LA32-NEXT: move $a1, $zero ++; 
LA32-NEXT: bl %plt(__atomic_load_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_monotonic_i64: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.d $a0, $a0, 0 ++; LA64-NEXT: ret ++ %val = load atomic i64, ptr %ptr monotonic, align 8 ++ ret i64 %val ++} ++ ++define i8 @load_seq_cst_i8(ptr %ptr) { ++; LA32-LABEL: load_seq_cst_i8: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.b $a0, $a0, 0 ++; LA32-NEXT: dbar 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_seq_cst_i8: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.b $a0, $a0, 0 ++; LA64-NEXT: dbar 0 ++; LA64-NEXT: ret ++ %val = load atomic i8, ptr %ptr seq_cst, align 1 ++ ret i8 %val ++} ++ ++define i16 @load_seq_cst_i16(ptr %ptr) { ++; LA32-LABEL: load_seq_cst_i16: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.h $a0, $a0, 0 ++; LA32-NEXT: dbar 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_seq_cst_i16: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.h $a0, $a0, 0 ++; LA64-NEXT: dbar 0 ++; LA64-NEXT: ret ++ %val = load atomic i16, ptr %ptr seq_cst, align 2 ++ ret i16 %val ++} ++ ++define i32 @load_seq_cst_i32(ptr %ptr) { ++; LA32-LABEL: load_seq_cst_i32: ++; LA32: # %bb.0: ++; LA32-NEXT: ld.w $a0, $a0, 0 ++; LA32-NEXT: dbar 0 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_seq_cst_i32: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.w $a0, $a0, 0 ++; LA64-NEXT: dbar 0 ++; LA64-NEXT: ret ++ %val = load atomic i32, ptr %ptr seq_cst, align 4 ++ ret i32 %val ++} ++ ++define i64 @load_seq_cst_i64(ptr %ptr) { ++; LA32-LABEL: load_seq_cst_i64: ++; LA32: # %bb.0: ++; LA32-NEXT: addi.w $sp, $sp, -16 ++; LA32-NEXT: .cfi_def_cfa_offset 16 ++; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ++; LA32-NEXT: .cfi_offset 1, -4 ++; LA32-NEXT: ori $a1, $zero, 5 ++; LA32-NEXT: bl %plt(__atomic_load_8) ++; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ++; LA32-NEXT: addi.w $sp, $sp, 16 ++; LA32-NEXT: ret ++; ++; LA64-LABEL: load_seq_cst_i64: ++; LA64: # %bb.0: ++; LA64-NEXT: ld.d $a0, $a0, 0 ++; LA64-NEXT: 
dbar 0 ++; LA64-NEXT: ret ++ %val = load atomic i64, ptr %ptr seq_cst, align 8 ++ ret i64 %val ++} ++ + define void @store_release_i8(ptr %ptr, i8 signext %v) { + ; LA32-LABEL: store_release_i8: + ; LA32: # %bb.0: +-- +2.20.1 + + +From 0f189600f07f701d96940c2cc52ca762d2be9104 Mon Sep 17 00:00:00 2001 +From: WANG Xuerui +Date: Wed, 11 Oct 2023 10:39:13 +0800 +Subject: [PATCH 3/7] [LoongArch] Support finer-grained DBAR hints for LA664+ + (#68787) + +These are treated as DBAR 0 on older uarchs, so we can start to +unconditionally emit the new hints right away. + +Co-authored-by: WANG Rui +(cherry picked from commit 956482de13107b640cffedd08610fcccd98f708f) +--- + .../LoongArchExpandAtomicPseudoInsts.cpp | 4 +- + .../LoongArch/LoongArchISelLowering.cpp | 20 +++++++ + .../Target/LoongArch/LoongArchISelLowering.h | 1 + + .../Target/LoongArch/LoongArchInstrInfo.td | 24 +++++++- + .../LoongArch/atomicrmw-uinc-udec-wrap.ll | 16 ++--- + .../ir-instruction/atomic-cmpxchg.ll | 24 ++++---- + .../LoongArch/ir-instruction/atomicrmw-fp.ll | 48 +++++++-------- + .../ir-instruction/fence-singlethread.ll | 4 +- + .../CodeGen/LoongArch/ir-instruction/fence.ll | 16 ++--- + .../ir-instruction/load-store-atomic.ll | 58 +++++++++---------- + 10 files changed, 129 insertions(+), 86 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +index eb78ef065b21..b348cb56c136 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +@@ -579,8 +579,8 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg( + case AtomicOrdering::Acquire: + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: +- // TODO: acquire +- hint = 0; ++ // acquire ++ hint = 0b10100; + break; + default: + hint = 0x700; +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp 
b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 5affaf37ad5a..33a3197013cc 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -159,6 +159,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, + // The MULO libcall is not part of libgcc, only compiler-rt. + setLibcallName(RTLIB::MULO_I128, nullptr); + ++ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); ++ + static const ISD::CondCode FPCCToExpand[] = { + ISD::SETOGT, ISD::SETOGE, ISD::SETUGT, ISD::SETUGE, + ISD::SETGE, ISD::SETNE, ISD::SETGT}; +@@ -366,6 +368,8 @@ bool LoongArchTargetLowering::isOffsetFoldingLegal( + SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { ++ case ISD::ATOMIC_FENCE: ++ return lowerATOMIC_FENCE(Op, DAG); + case ISD::EH_DWARF_CFA: + return lowerEH_DWARF_CFA(Op, DAG); + case ISD::GlobalAddress: +@@ -542,6 +546,22 @@ LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, + return SDValue(); + } + ++SDValue LoongArchTargetLowering::lowerATOMIC_FENCE(SDValue Op, ++ SelectionDAG &DAG) const { ++ SDLoc DL(Op); ++ SyncScope::ID FenceSSID = ++ static_cast(Op.getConstantOperandVal(2)); ++ ++ // singlethread fences only synchronize with signal handlers on the same ++ // thread and thus only need to preserve instruction order, not actually ++ // enforce memory ordering. ++ if (FenceSSID == SyncScope::SingleThread) ++ // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
++ return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0)); ++ ++ return Op; ++} ++ + SDValue LoongArchTargetLowering::lowerWRITE_REGISTER(SDValue Op, + SelectionDAG &DAG) const { + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 6b5a851ec55d..23b90640a690 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -266,6 +266,7 @@ private: + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; ++ SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index a9b0db30c2f6..fcbd314507a5 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -1590,7 +1590,29 @@ def : RegRegStPat; + + /// Atomic loads and stores + +-def : Pat<(atomic_fence timm, timm), (DBAR 0)>; ++// DBAR hint encoding for LA664 and later micro-architectures, paraphrased from ++// the Linux patch revealing it [1]: ++// ++// - Bit 4: kind of constraint (0: completion, 1: ordering) ++// - Bit 3: barrier for previous read (0: true, 1: false) ++// - Bit 2: barrier for previous write (0: true, 1: false) ++// - Bit 1: barrier for succeeding read (0: true, 1: false) ++// - Bit 0: barrier for succeeding write (0: true, 1: false) ++// ++// Hint 0x700: barrier for "read after read" from the same address, which is ++// e.g. needed by LL-SC loops on older models. (DBAR 0x700 behaves the same as ++// nop if such reordering is disabled on supporting newer models.) 
++// ++// [1]: https://lore.kernel.org/loongarch/20230516124536.535343-1-chenhuacai@loongson.cn/ ++// ++// Implementations without support for the finer-granularity hints simply treat ++// all as the full barrier (DBAR 0), so we can unconditionally start emiting the ++// more precise hints right away. ++ ++def : Pat<(atomic_fence 4, timm), (DBAR 0b10100)>; // acquire ++def : Pat<(atomic_fence 5, timm), (DBAR 0b10010)>; // release ++def : Pat<(atomic_fence 6, timm), (DBAR 0b10000)>; // acqrel ++def : Pat<(atomic_fence 7, timm), (DBAR 0b10000)>; // seqcst + + defm : LdPat; + defm : LdPat; +diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +index 32106886c783..d8908acbc945 100644 +--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll ++++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +@@ -40,7 +40,7 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: b .LBB0_6 + ; LA64-NEXT: .LBB0_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64-NEXT: addi.w $a6, $a3, 0 +@@ -93,7 +93,7 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: b .LBB1_6 + ; LA64-NEXT: .LBB1_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64-NEXT: addi.w $a6, $a3, 0 +@@ -133,7 +133,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-NEXT: b .LBB2_6 + ; LA64-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64-NEXT: move $a3, $a1 +@@ -171,7 +171,7 @@ define i64 
@atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { + ; LA64-NEXT: b .LBB3_6 + ; LA64-NEXT: .LBB3_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1 + ; LA64-NEXT: bne $a2, $a3, .LBB3_1 +@@ -226,7 +226,7 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: b .LBB4_6 + ; LA64-NEXT: .LBB4_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB4_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 + ; LA64-NEXT: addi.w $a7, $a3, 0 +@@ -284,7 +284,7 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: b .LBB5_6 + ; LA64-NEXT: .LBB5_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB5_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 + ; LA64-NEXT: addi.w $a7, $a3, 0 +@@ -329,7 +329,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-NEXT: b .LBB6_6 + ; LA64-NEXT: .LBB6_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB6_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 +@@ -372,7 +372,7 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) { + ; LA64-NEXT: b .LBB7_6 + ; LA64-NEXT: .LBB7_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB7_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1 + ; LA64-NEXT: bne $a2, $a3, .LBB7_1 +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +index 1ac20d10e587..4f25a1d69af1 100644 +--- 
a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +@@ -27,7 +27,7 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind { + ; LA64-NEXT: beqz $a5, .LBB0_1 + ; LA64-NEXT: b .LBB0_4 + ; LA64-NEXT: .LBB0_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB0_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire acquire +@@ -61,7 +61,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind + ; LA64-NEXT: beqz $a5, .LBB1_1 + ; LA64-NEXT: b .LBB1_4 + ; LA64-NEXT: .LBB1_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB1_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire acquire +@@ -80,7 +80,7 @@ define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind + ; LA64-NEXT: beqz $a4, .LBB2_1 + ; LA64-NEXT: b .LBB2_4 + ; LA64-NEXT: .LBB2_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB2_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire +@@ -99,7 +99,7 @@ define void @cmpxchg_i64_acquire_acquire(ptr %ptr, i64 %cmp, i64 %val) nounwind + ; LA64-NEXT: beqz $a4, .LBB3_1 + ; LA64-NEXT: b .LBB3_4 + ; LA64-NEXT: .LBB3_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB3_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire acquire +@@ -132,7 +132,7 @@ define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: beqz $a6, .LBB4_1 + ; LA64-NEXT: b .LBB4_4 + ; LA64-NEXT: .LBB4_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB4_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret +@@ -168,7 +168,7 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou + ; LA64-NEXT: beqz $a6, .LBB5_1 + ; LA64-NEXT: b .LBB5_4 + ; LA64-NEXT: .LBB5_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; 
LA64-NEXT: .LBB5_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret +@@ -189,7 +189,7 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou + ; LA64-NEXT: beqz $a4, .LBB6_1 + ; LA64-NEXT: b .LBB6_4 + ; LA64-NEXT: .LBB6_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB6_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret +@@ -210,7 +210,7 @@ define i64 @cmpxchg_i64_acquire_acquire_reti64(ptr %ptr, i64 %cmp, i64 %val) nou + ; LA64-NEXT: beqz $a4, .LBB7_1 + ; LA64-NEXT: b .LBB7_4 + ; LA64-NEXT: .LBB7_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB7_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret +@@ -245,7 +245,7 @@ define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: beqz $a6, .LBB8_1 + ; LA64-NEXT: b .LBB8_4 + ; LA64-NEXT: .LBB8_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB8_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 +@@ -284,7 +284,7 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw + ; LA64-NEXT: beqz $a6, .LBB9_1 + ; LA64-NEXT: b .LBB9_4 + ; LA64-NEXT: .LBB9_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB9_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 +@@ -308,7 +308,7 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw + ; LA64-NEXT: beqz $a4, .LBB10_1 + ; LA64-NEXT: b .LBB10_4 + ; LA64-NEXT: .LBB10_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB10_4: + ; LA64-NEXT: addi.w $a0, $a1, 0 + ; LA64-NEXT: xor $a0, $a3, $a0 +@@ -331,7 +331,7 @@ define i1 @cmpxchg_i64_acquire_acquire_reti1(ptr %ptr, i64 %cmp, i64 %val) nounw + ; LA64-NEXT: beqz $a4, .LBB11_1 + ; LA64-NEXT: b .LBB11_4 + ; LA64-NEXT: .LBB11_3: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB11_4: + ; LA64-NEXT: xor $a0, $a3, $a1 + ; LA64-NEXT: sltui $a0, $a0, 1 +diff --git 
a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +index 02d481cb3865..589360823b14 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +@@ -29,7 +29,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB0_6 + ; LA64F-NEXT: .LBB0_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -64,7 +64,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB0_6 + ; LA64D-NEXT: .LBB0_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -103,7 +103,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB1_6 + ; LA64F-NEXT: .LBB1_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -138,7 +138,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB1_6 + ; LA64D-NEXT: .LBB1_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -178,7 +178,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB2_6 + ; LA64F-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + 
; LA64F-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -214,7 +214,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB2_6 + ; LA64D-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -254,7 +254,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB3_6 + ; LA64F-NEXT: .LBB3_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -290,7 +290,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB3_6 + ; LA64D-NEXT: .LBB3_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -1385,7 +1385,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB16_6 + ; LA64F-NEXT: .LBB16_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB16_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB16_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB16_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -1420,7 +1420,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB16_6 + ; LA64D-NEXT: .LBB16_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB16_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -1459,7 +1459,7 @@ 
define float @float_fsub_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB17_6 + ; LA64F-NEXT: .LBB17_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB17_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -1494,7 +1494,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB17_6 + ; LA64D-NEXT: .LBB17_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB17_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB17_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB17_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -1534,7 +1534,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB18_6 + ; LA64F-NEXT: .LBB18_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB18_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -1570,7 +1570,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB18_6 + ; LA64D-NEXT: .LBB18_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB18_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -1610,7 +1610,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB19_6 + ; LA64F-NEXT: .LBB19_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB19_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -1646,7 +1646,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB19_6 + ; LA64D-NEXT: .LBB19_5: # %atomicrmw.start + ; 
LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB19_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -2087,7 +2087,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB24_6 + ; LA64F-NEXT: .LBB24_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB24_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -2122,7 +2122,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB24_6 + ; LA64D-NEXT: .LBB24_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB24_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -2161,7 +2161,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB25_6 + ; LA64F-NEXT: .LBB25_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB25_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -2196,7 +2196,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB25_6 + ; LA64D-NEXT: .LBB25_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB25_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -2236,7 +2236,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB26_6 + ; LA64F-NEXT: .LBB26_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB26_6: # 
%atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -2272,7 +2272,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB26_6 + ; LA64D-NEXT: .LBB26_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB26_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +@@ -2312,7 +2312,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: b .LBB27_6 + ; LA64F-NEXT: .LBB27_5: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 +-; LA64F-NEXT: dbar 0 ++; LA64F-NEXT: dbar 20 + ; LA64F-NEXT: .LBB27_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +@@ -2348,7 +2348,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: b .LBB27_6 + ; LA64D-NEXT: .LBB27_5: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 +-; LA64D-NEXT: dbar 0 ++; LA64D-NEXT: dbar 20 + ; LA64D-NEXT: .LBB27_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll +index 8d6056bc7677..a8b164a4cd3c 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/fence-singlethread.ll +@@ -5,12 +5,12 @@ + define void @fence_singlethread() { + ; LA32-LABEL: fence_singlethread: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: #MEMBARRIER + ; LA32-NEXT: ret + ; + ; LA64-LABEL: fence_singlethread: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: #MEMBARRIER + ; LA64-NEXT: ret + fence syncscope("singlethread") seq_cst + ret void +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/fence.ll 
b/llvm/test/CodeGen/LoongArch/ir-instruction/fence.ll +index 724639f3c6fb..c5b2232f9b80 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/fence.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/fence.ll +@@ -5,12 +5,12 @@ + define void @fence_acquire() nounwind { + ; LA32-LABEL: fence_acquire: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 20 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: fence_acquire: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: ret + fence acquire + ret void +@@ -19,12 +19,12 @@ define void @fence_acquire() nounwind { + define void @fence_release() nounwind { + ; LA32-LABEL: fence_release: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 18 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: fence_release: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 18 + ; LA64-NEXT: ret + fence release + ret void +@@ -33,12 +33,12 @@ define void @fence_release() nounwind { + define void @fence_acq_rel() nounwind { + ; LA32-LABEL: fence_acq_rel: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: fence_acq_rel: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + fence acq_rel + ret void +@@ -47,12 +47,12 @@ define void @fence_acq_rel() nounwind { + define void @fence_seq_cst() nounwind { + ; LA32-LABEL: fence_seq_cst: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: fence_seq_cst: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + fence seq_cst + ret void +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +index deff11723d27..8b170c479eed 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store-atomic.ll +@@ -6,13 +6,13 @@ define i8 @load_acquire_i8(ptr 
%ptr) { + ; LA32-LABEL: load_acquire_i8: + ; LA32: # %bb.0: + ; LA32-NEXT: ld.b $a0, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 20 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: load_acquire_i8: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.b $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: ret + %val = load atomic i8, ptr %ptr acquire, align 1 + ret i8 %val +@@ -22,13 +22,13 @@ define i16 @load_acquire_i16(ptr %ptr) { + ; LA32-LABEL: load_acquire_i16: + ; LA32: # %bb.0: + ; LA32-NEXT: ld.h $a0, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 20 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: load_acquire_i16: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.h $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: ret + %val = load atomic i16, ptr %ptr acquire, align 2 + ret i16 %val +@@ -38,13 +38,13 @@ define i32 @load_acquire_i32(ptr %ptr) { + ; LA32-LABEL: load_acquire_i32: + ; LA32: # %bb.0: + ; LA32-NEXT: ld.w $a0, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 20 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: load_acquire_i32: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.w $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: ret + %val = load atomic i32, ptr %ptr acquire, align 4 + ret i32 %val +@@ -66,7 +66,7 @@ define i64 @load_acquire_i64(ptr %ptr) { + ; LA64-LABEL: load_acquire_i64: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.d $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 20 + ; LA64-NEXT: ret + %val = load atomic i64, ptr %ptr acquire, align 8 + ret i64 %val +@@ -202,13 +202,13 @@ define i8 @load_seq_cst_i8(ptr %ptr) { + ; LA32-LABEL: load_seq_cst_i8: + ; LA32: # %bb.0: + ; LA32-NEXT: ld.b $a0, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: load_seq_cst_i8: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.b $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + %val = load atomic i8, ptr %ptr seq_cst, align 1 + ret i8 %val +@@ -218,13 +218,13 @@ define i16 @load_seq_cst_i16(ptr %ptr) 
{ + ; LA32-LABEL: load_seq_cst_i16: + ; LA32: # %bb.0: + ; LA32-NEXT: ld.h $a0, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: load_seq_cst_i16: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.h $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + %val = load atomic i16, ptr %ptr seq_cst, align 2 + ret i16 %val +@@ -234,13 +234,13 @@ define i32 @load_seq_cst_i32(ptr %ptr) { + ; LA32-LABEL: load_seq_cst_i32: + ; LA32: # %bb.0: + ; LA32-NEXT: ld.w $a0, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: load_seq_cst_i32: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.w $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + %val = load atomic i32, ptr %ptr seq_cst, align 4 + ret i32 %val +@@ -262,7 +262,7 @@ define i64 @load_seq_cst_i64(ptr %ptr) { + ; LA64-LABEL: load_seq_cst_i64: + ; LA64: # %bb.0: + ; LA64-NEXT: ld.d $a0, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + %val = load atomic i64, ptr %ptr seq_cst, align 8 + ret i64 %val +@@ -271,13 +271,13 @@ define i64 @load_seq_cst_i64(ptr %ptr) { + define void @store_release_i8(ptr %ptr, i8 signext %v) { + ; LA32-LABEL: store_release_i8: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 18 + ; LA32-NEXT: st.b $a1, $a0, 0 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: store_release_i8: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 18 + ; LA64-NEXT: st.b $a1, $a0, 0 + ; LA64-NEXT: ret + store atomic i8 %v, ptr %ptr release, align 1 +@@ -287,13 +287,13 @@ define void @store_release_i8(ptr %ptr, i8 signext %v) { + define void @store_release_i16(ptr %ptr, i16 signext %v) { + ; LA32-LABEL: store_release_i16: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 18 + ; LA32-NEXT: st.h $a1, $a0, 0 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: store_release_i16: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 18 + ; LA64-NEXT: st.h $a1, $a0, 0 + ; LA64-NEXT: ret 
+ store atomic i16 %v, ptr %ptr release, align 2 +@@ -303,7 +303,7 @@ define void @store_release_i16(ptr %ptr, i16 signext %v) { + define void @store_release_i32(ptr %ptr, i32 signext %v) { + ; LA32-LABEL: store_release_i32: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 18 + ; LA32-NEXT: st.w $a1, $a0, 0 + ; LA32-NEXT: ret + ; +@@ -465,16 +465,16 @@ define void @store_monotonic_i64(ptr %ptr, i64 %v) { + define void @store_seq_cst_i8(ptr %ptr, i8 signext %v) { + ; LA32-LABEL: store_seq_cst_i8: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: st.b $a1, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: store_seq_cst_i8: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: st.b $a1, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + store atomic i8 %v, ptr %ptr seq_cst, align 1 + ret void +@@ -483,16 +483,16 @@ define void @store_seq_cst_i8(ptr %ptr, i8 signext %v) { + define void @store_seq_cst_i16(ptr %ptr, i16 signext %v) { + ; LA32-LABEL: store_seq_cst_i16: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: st.h $a1, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: store_seq_cst_i16: + ; LA64: # %bb.0: +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: st.h $a1, $a0, 0 +-; LA64-NEXT: dbar 0 ++; LA64-NEXT: dbar 16 + ; LA64-NEXT: ret + store atomic i16 %v, ptr %ptr seq_cst, align 2 + ret void +@@ -501,9 +501,9 @@ define void @store_seq_cst_i16(ptr %ptr, i16 signext %v) { + define void @store_seq_cst_i32(ptr %ptr, i32 signext %v) { + ; LA32-LABEL: store_seq_cst_i32: + ; LA32: # %bb.0: +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: st.w $a1, $a0, 0 +-; LA32-NEXT: dbar 0 ++; LA32-NEXT: dbar 16 + ; LA32-NEXT: ret + ; + ; LA64-LABEL: store_seq_cst_i32: +-- +2.20.1 + + +From 7e37560cddfa108426ff9f87871c71ed01e7596e Mon Sep 17 00:00:00 2001 +From: 
Weining Lu +Date: Tue, 17 Oct 2023 17:41:32 +0800 +Subject: [PATCH 4/7] [LoongArch] Precommit a test for atomic cmpxchg + optmization + +(cherry picked from commit b2773d170cb4bdb4b19ba801b5eb55395024b3ae) +--- + .../ir-instruction/atomic-cmpxchg.ll | 385 +++++++++++------- + 1 file changed, 245 insertions(+), 140 deletions(-) + +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +index 4f25a1d69af1..174bb9d0ff7d 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +@@ -106,6 +106,111 @@ define void @cmpxchg_i64_acquire_acquire(ptr %ptr, i64 %cmp, i64 %val) nounwind + ret void + } + ++define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { ++; LA64-LABEL: cmpxchg_i8_acquire_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: andi $a1, $a1, 255 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: andi $a2, $a2, 255 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: ori $a4, $zero, 255 ++; LA64-NEXT: sll.w $a0, $a4, $a0 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a3, 0 ++; LA64-NEXT: and $a5, $a4, $a0 ++; LA64-NEXT: bne $a5, $a1, .LBB4_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 ++; LA64-NEXT: andn $a5, $a4, $a0 ++; LA64-NEXT: or $a5, $a5, $a2 ++; LA64-NEXT: sc.w $a5, $a3, 0 ++; LA64-NEXT: beqz $a5, .LBB4_1 ++; LA64-NEXT: b .LBB4_4 ++; LA64-NEXT: .LBB4_3: ++; LA64-NEXT: dbar 20 ++; LA64-NEXT: .LBB4_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire monotonic ++ ret void ++} ++ ++define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwind { ++; LA64-LABEL: 
cmpxchg_i16_acquire_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $zero, -4 ++; LA64-NEXT: and $a3, $a0, $a3 ++; LA64-NEXT: slli.d $a0, $a0, 3 ++; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 ++; LA64-NEXT: sll.w $a1, $a1, $a0 ++; LA64-NEXT: bstrpick.d $a2, $a2, 15, 0 ++; LA64-NEXT: sll.w $a2, $a2, $a0 ++; LA64-NEXT: lu12i.w $a4, 15 ++; LA64-NEXT: ori $a4, $a4, 4095 ++; LA64-NEXT: sll.w $a0, $a4, $a0 ++; LA64-NEXT: addi.w $a0, $a0, 0 ++; LA64-NEXT: addi.w $a2, $a2, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 ++; LA64-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a4, $a3, 0 ++; LA64-NEXT: and $a5, $a4, $a0 ++; LA64-NEXT: bne $a5, $a1, .LBB5_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 ++; LA64-NEXT: andn $a5, $a4, $a0 ++; LA64-NEXT: or $a5, $a5, $a2 ++; LA64-NEXT: sc.w $a5, $a3, 0 ++; LA64-NEXT: beqz $a5, .LBB5_1 ++; LA64-NEXT: b .LBB5_4 ++; LA64-NEXT: .LBB5_3: ++; LA64-NEXT: dbar 20 ++; LA64-NEXT: .LBB5_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire monotonic ++ ret void ++} ++ ++define void @cmpxchg_i32_acquire_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { ++; LA64-LABEL: cmpxchg_i32_acquire_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.w $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB6_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.w $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB6_1 ++; LA64-NEXT: b .LBB6_4 ++; LA64-NEXT: .LBB6_3: ++; LA64-NEXT: dbar 20 ++; LA64-NEXT: .LBB6_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire monotonic ++ ret void ++} ++ ++define void @cmpxchg_i64_acquire_monotonic(ptr %ptr, i64 %cmp, i64 %val) nounwind { ++; LA64-LABEL: cmpxchg_i64_acquire_monotonic: ++; LA64: # %bb.0: ++; LA64-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: ll.d $a3, $a0, 0 ++; LA64-NEXT: bne $a3, $a1, .LBB7_3 ++; LA64-NEXT: # %bb.2: # in 
Loop: Header=BB7_1 Depth=1 ++; LA64-NEXT: move $a4, $a2 ++; LA64-NEXT: sc.d $a4, $a0, 0 ++; LA64-NEXT: beqz $a4, .LBB7_1 ++; LA64-NEXT: b .LBB7_4 ++; LA64-NEXT: .LBB7_3: ++; LA64-NEXT: dbar 20 ++; LA64-NEXT: .LBB7_4: ++; LA64-NEXT: ret ++ %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire monotonic ++ ret void ++} ++ + define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind { + ; LA64-LABEL: cmpxchg_i8_acquire_acquire_reti8: + ; LA64: # %bb.0: +@@ -121,19 +226,19 @@ define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: andi $a1, $a1, 255 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a4 +-; LA64-NEXT: bne $a6, $a1, .LBB4_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB8_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB8_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a4 + ; LA64-NEXT: or $a6, $a6, $a2 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB4_1 +-; LA64-NEXT: b .LBB4_4 +-; LA64-NEXT: .LBB4_3: ++; LA64-NEXT: beqz $a6, .LBB8_1 ++; LA64-NEXT: b .LBB8_4 ++; LA64-NEXT: .LBB8_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB4_4: ++; LA64-NEXT: .LBB8_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire acquire +@@ -157,19 +262,19 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou + ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a4 +-; LA64-NEXT: bne $a6, $a1, .LBB5_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 ++; 
LA64-NEXT: bne $a6, $a1, .LBB9_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB9_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a4 + ; LA64-NEXT: or $a6, $a6, $a2 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB5_1 +-; LA64-NEXT: b .LBB5_4 +-; LA64-NEXT: .LBB5_3: ++; LA64-NEXT: beqz $a6, .LBB9_1 ++; LA64-NEXT: b .LBB9_4 ++; LA64-NEXT: .LBB9_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB5_4: ++; LA64-NEXT: .LBB9_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire acquire +@@ -180,17 +285,17 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou + define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti32: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB6_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB10_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB6_1 +-; LA64-NEXT: b .LBB6_4 +-; LA64-NEXT: .LBB6_3: ++; LA64-NEXT: beqz $a4, .LBB10_1 ++; LA64-NEXT: b .LBB10_4 ++; LA64-NEXT: .LBB10_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB6_4: ++; LA64-NEXT: .LBB10_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire +@@ -201,17 +306,17 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou + define i64 @cmpxchg_i64_acquire_acquire_reti64(ptr %ptr, i64 %cmp, i64 %val) nounwind { + ; LA64-LABEL: cmpxchg_i64_acquire_acquire_reti64: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.d $a3, $a0, 0 +-; LA64-NEXT: bne $a3, 
$a1, .LBB7_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB11_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB7_1 +-; LA64-NEXT: b .LBB7_4 +-; LA64-NEXT: .LBB7_3: ++; LA64-NEXT: beqz $a4, .LBB11_1 ++; LA64-NEXT: b .LBB11_4 ++; LA64-NEXT: .LBB11_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB7_4: ++; LA64-NEXT: .LBB11_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire acquire +@@ -234,19 +339,19 @@ define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: addi.w $a2, $a4, 0 +-; LA64-NEXT: .LBB8_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a2 +-; LA64-NEXT: bne $a6, $a1, .LBB8_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB8_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB12_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB12_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a2 + ; LA64-NEXT: or $a6, $a6, $a0 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB8_1 +-; LA64-NEXT: b .LBB8_4 +-; LA64-NEXT: .LBB8_3: ++; LA64-NEXT: beqz $a6, .LBB12_1 ++; LA64-NEXT: b .LBB12_4 ++; LA64-NEXT: .LBB12_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB8_4: ++; LA64-NEXT: .LBB12_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: xor $a0, $a1, $a0 +@@ -273,19 +378,19 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: addi.w $a2, $a4, 0 +-; LA64-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a2 +-; LA64-NEXT: 
bne $a6, $a1, .LBB9_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB9_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB13_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB13_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a2 + ; LA64-NEXT: or $a6, $a6, $a0 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB9_1 +-; LA64-NEXT: b .LBB9_4 +-; LA64-NEXT: .LBB9_3: ++; LA64-NEXT: beqz $a6, .LBB13_1 ++; LA64-NEXT: b .LBB13_4 ++; LA64-NEXT: .LBB13_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB9_4: ++; LA64-NEXT: .LBB13_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: xor $a0, $a1, $a0 +@@ -299,17 +404,17 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw + define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti1: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB10_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB14_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB14_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB10_1 +-; LA64-NEXT: b .LBB10_4 +-; LA64-NEXT: .LBB10_3: ++; LA64-NEXT: beqz $a4, .LBB14_1 ++; LA64-NEXT: b .LBB14_4 ++; LA64-NEXT: .LBB14_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB10_4: ++; LA64-NEXT: .LBB14_4: + ; LA64-NEXT: addi.w $a0, $a1, 0 + ; LA64-NEXT: xor $a0, $a3, $a0 + ; LA64-NEXT: sltui $a0, $a0, 1 +@@ -322,17 +427,17 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw + define i1 @cmpxchg_i64_acquire_acquire_reti1(ptr %ptr, i64 %cmp, i64 %val) nounwind { + ; LA64-LABEL: cmpxchg_i64_acquire_acquire_reti1: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB15_1: # =>This Inner Loop Header: 
Depth=1 + ; LA64-NEXT: ll.d $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB11_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB11_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB15_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB15_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB11_1 +-; LA64-NEXT: b .LBB11_4 +-; LA64-NEXT: .LBB11_3: ++; LA64-NEXT: beqz $a4, .LBB15_1 ++; LA64-NEXT: b .LBB15_4 ++; LA64-NEXT: .LBB15_3: + ; LA64-NEXT: dbar 20 +-; LA64-NEXT: .LBB11_4: ++; LA64-NEXT: .LBB15_4: + ; LA64-NEXT: xor $a0, $a3, $a1 + ; LA64-NEXT: sltui $a0, $a0, 1 + ; LA64-NEXT: ret +@@ -356,19 +461,19 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: addi.w $a2, $a2, 0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a3, 0 + ; LA64-NEXT: and $a5, $a4, $a0 +-; LA64-NEXT: bne $a5, $a1, .LBB12_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB12_1 Depth=1 ++; LA64-NEXT: bne $a5, $a1, .LBB16_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB16_1 Depth=1 + ; LA64-NEXT: andn $a5, $a4, $a0 + ; LA64-NEXT: or $a5, $a5, $a2 + ; LA64-NEXT: sc.w $a5, $a3, 0 +-; LA64-NEXT: beqz $a5, .LBB12_1 +-; LA64-NEXT: b .LBB12_4 +-; LA64-NEXT: .LBB12_3: ++; LA64-NEXT: beqz $a5, .LBB16_1 ++; LA64-NEXT: b .LBB16_4 ++; LA64-NEXT: .LBB16_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB12_4: ++; LA64-NEXT: .LBB16_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic + ret void +@@ -390,19 +495,19 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: addi.w $a2, $a2, 0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB13_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB17_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a4, $a3, 0 
+ ; LA64-NEXT: and $a5, $a4, $a0 +-; LA64-NEXT: bne $a5, $a1, .LBB13_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB13_1 Depth=1 ++; LA64-NEXT: bne $a5, $a1, .LBB17_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB17_1 Depth=1 + ; LA64-NEXT: andn $a5, $a4, $a0 + ; LA64-NEXT: or $a5, $a5, $a2 + ; LA64-NEXT: sc.w $a5, $a3, 0 +-; LA64-NEXT: beqz $a5, .LBB13_1 +-; LA64-NEXT: b .LBB13_4 +-; LA64-NEXT: .LBB13_3: ++; LA64-NEXT: beqz $a5, .LBB17_1 ++; LA64-NEXT: b .LBB17_4 ++; LA64-NEXT: .LBB17_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB13_4: ++; LA64-NEXT: .LBB17_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic + ret void +@@ -411,17 +516,17 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw + define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB14_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB14_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB18_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB18_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB14_1 +-; LA64-NEXT: b .LBB14_4 +-; LA64-NEXT: .LBB14_3: ++; LA64-NEXT: beqz $a4, .LBB18_1 ++; LA64-NEXT: b .LBB18_4 ++; LA64-NEXT: .LBB18_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB14_4: ++; LA64-NEXT: .LBB18_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic + ret void +@@ -430,17 +535,17 @@ define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounw + define void @cmpxchg_i64_monotonic_monotonic(ptr %ptr, i64 %cmp, i64 %val) nounwind { + ; LA64-LABEL: cmpxchg_i64_monotonic_monotonic: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB15_1: # =>This Inner Loop Header: Depth=1 ++; 
LA64-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.d $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB15_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB15_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB19_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB19_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB15_1 +-; LA64-NEXT: b .LBB15_4 +-; LA64-NEXT: .LBB15_3: ++; LA64-NEXT: beqz $a4, .LBB19_1 ++; LA64-NEXT: b .LBB19_4 ++; LA64-NEXT: .LBB19_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB15_4: ++; LA64-NEXT: .LBB19_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic + ret void +@@ -461,19 +566,19 @@ define i8 @cmpxchg_i8_monotonic_monotonic_reti8(ptr %ptr, i8 %cmp, i8 %val) noun + ; LA64-NEXT: andi $a1, $a1, 255 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB16_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a4 +-; LA64-NEXT: bne $a6, $a1, .LBB16_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB16_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB20_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB20_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a4 + ; LA64-NEXT: or $a6, $a6, $a2 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB16_1 +-; LA64-NEXT: b .LBB16_4 +-; LA64-NEXT: .LBB16_3: ++; LA64-NEXT: beqz $a6, .LBB20_1 ++; LA64-NEXT: b .LBB20_4 ++; LA64-NEXT: .LBB20_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB16_4: ++; LA64-NEXT: .LBB20_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic +@@ -497,19 +602,19 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) + ; LA64-NEXT: bstrpick.d $a1, $a1, 15, 0 + ; LA64-NEXT: sll.w $a1, $a1, $a0 + ; LA64-NEXT: addi.w $a1, $a1, 0 +-; LA64-NEXT: .LBB17_1: # =>This Inner Loop Header: 
Depth=1 ++; LA64-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a4 +-; LA64-NEXT: bne $a6, $a1, .LBB17_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB17_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB21_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a4 + ; LA64-NEXT: or $a6, $a6, $a2 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB17_1 +-; LA64-NEXT: b .LBB17_4 +-; LA64-NEXT: .LBB17_3: ++; LA64-NEXT: beqz $a6, .LBB21_1 ++; LA64-NEXT: b .LBB21_4 ++; LA64-NEXT: .LBB21_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB17_4: ++; LA64-NEXT: .LBB21_4: + ; LA64-NEXT: srl.w $a0, $a5, $a0 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic +@@ -520,17 +625,17 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) + define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti32: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB18_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB18_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB22_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB18_1 +-; LA64-NEXT: b .LBB18_4 +-; LA64-NEXT: .LBB18_3: ++; LA64-NEXT: beqz $a4, .LBB22_1 ++; LA64-NEXT: b .LBB22_4 ++; LA64-NEXT: .LBB22_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB18_4: ++; LA64-NEXT: .LBB22_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic +@@ -541,17 +646,17 @@ define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) + define i64 @cmpxchg_i64_monotonic_monotonic_reti64(ptr %ptr, i64 
%cmp, i64 %val) nounwind { + ; LA64-LABEL: cmpxchg_i64_monotonic_monotonic_reti64: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB19_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.d $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB19_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB19_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB23_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB23_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB19_1 +-; LA64-NEXT: b .LBB19_4 +-; LA64-NEXT: .LBB19_3: ++; LA64-NEXT: beqz $a4, .LBB23_1 ++; LA64-NEXT: b .LBB23_4 ++; LA64-NEXT: .LBB23_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB19_4: ++; LA64-NEXT: .LBB23_4: + ; LA64-NEXT: move $a0, $a3 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic +@@ -574,19 +679,19 @@ define i1 @cmpxchg_i8_monotonic_monotonic_reti1(ptr %ptr, i8 %cmp, i8 %val) noun + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: addi.w $a2, $a4, 0 +-; LA64-NEXT: .LBB20_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB24_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a2 +-; LA64-NEXT: bne $a6, $a1, .LBB20_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB20_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB24_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB24_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a2 + ; LA64-NEXT: or $a6, $a6, $a0 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB20_1 +-; LA64-NEXT: b .LBB20_4 +-; LA64-NEXT: .LBB20_3: ++; LA64-NEXT: beqz $a6, .LBB24_1 ++; LA64-NEXT: b .LBB24_4 ++; LA64-NEXT: .LBB24_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB20_4: ++; LA64-NEXT: .LBB24_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: xor $a0, $a1, $a0 +@@ -613,19 +718,19 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n + ; 
LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: addi.w $a2, $a4, 0 +-; LA64-NEXT: .LBB21_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB25_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a5, $a3, 0 + ; LA64-NEXT: and $a6, $a5, $a2 +-; LA64-NEXT: bne $a6, $a1, .LBB21_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB21_1 Depth=1 ++; LA64-NEXT: bne $a6, $a1, .LBB25_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB25_1 Depth=1 + ; LA64-NEXT: andn $a6, $a5, $a2 + ; LA64-NEXT: or $a6, $a6, $a0 + ; LA64-NEXT: sc.w $a6, $a3, 0 +-; LA64-NEXT: beqz $a6, .LBB21_1 +-; LA64-NEXT: b .LBB21_4 +-; LA64-NEXT: .LBB21_3: ++; LA64-NEXT: beqz $a6, .LBB25_1 ++; LA64-NEXT: b .LBB25_4 ++; LA64-NEXT: .LBB25_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB21_4: ++; LA64-NEXT: .LBB25_4: + ; LA64-NEXT: and $a0, $a5, $a4 + ; LA64-NEXT: addi.w $a0, $a0, 0 + ; LA64-NEXT: xor $a0, $a1, $a0 +@@ -639,17 +744,17 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n + define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti1: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB26_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB22_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB26_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB26_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB22_1 +-; LA64-NEXT: b .LBB22_4 +-; LA64-NEXT: .LBB22_3: ++; LA64-NEXT: beqz $a4, .LBB26_1 ++; LA64-NEXT: b .LBB26_4 ++; LA64-NEXT: .LBB26_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB22_4: ++; LA64-NEXT: .LBB26_4: + ; LA64-NEXT: addi.w $a0, $a1, 0 + ; LA64-NEXT: xor $a0, $a3, $a0 + ; LA64-NEXT: sltui $a0, $a0, 1 +@@ -662,17 +767,17 @@ define i1 
@cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) n + define i1 @cmpxchg_i64_monotonic_monotonic_reti1(ptr %ptr, i64 %cmp, i64 %val) nounwind { + ; LA64-LABEL: cmpxchg_i64_monotonic_monotonic_reti1: + ; LA64: # %bb.0: +-; LA64-NEXT: .LBB23_1: # =>This Inner Loop Header: Depth=1 ++; LA64-NEXT: .LBB27_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.d $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB23_3 +-; LA64-NEXT: # %bb.2: # in Loop: Header=BB23_1 Depth=1 ++; LA64-NEXT: bne $a3, $a1, .LBB27_3 ++; LA64-NEXT: # %bb.2: # in Loop: Header=BB27_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.d $a4, $a0, 0 +-; LA64-NEXT: beqz $a4, .LBB23_1 +-; LA64-NEXT: b .LBB23_4 +-; LA64-NEXT: .LBB23_3: ++; LA64-NEXT: beqz $a4, .LBB27_1 ++; LA64-NEXT: b .LBB27_4 ++; LA64-NEXT: .LBB27_3: + ; LA64-NEXT: dbar 1792 +-; LA64-NEXT: .LBB23_4: ++; LA64-NEXT: .LBB27_4: + ; LA64-NEXT: xor $a0, $a3, $a1 + ; LA64-NEXT: sltui $a0, $a0, 1 + ; LA64-NEXT: ret +-- +2.20.1 + + +From 331674f3553b747d9869276ae34667dce7099a09 Mon Sep 17 00:00:00 2001 +From: Lu Weining <90239436+SixWeining@users.noreply.github.com> +Date: Thu, 19 Oct 2023 09:21:51 +0800 +Subject: [PATCH 5/7] [LoongArch] Improve codegen for atomic cmpxchg ops + (#69339) + +PR #67391 improved atomic codegen by handling memory ordering specified +by the `cmpxchg` instruction. An acquire barrier needs to be generated +when memory ordering includes an acquire operation. This PR improves the +codegen further by only handling the failure ordering. 
+ +(cherry picked from commit 78abc45c44cdadf76b30e1f3dc24936bb5627d68) +--- + .../LoongArchExpandAtomicPseudoInsts.cpp | 4 +- + .../LoongArch/LoongArchISelLowering.cpp | 7 ++- + .../Target/LoongArch/LoongArchInstrInfo.td | 55 ++++++++++++++++--- + .../ir-instruction/atomic-cmpxchg.ll | 8 +-- + 4 files changed, 56 insertions(+), 18 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +index b348cb56c136..18a532b55ee5 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +@@ -571,11 +571,11 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg( + BuildMI(LoopTailMBB, DL, TII->get(LoongArch::B)).addMBB(DoneMBB); + } + +- AtomicOrdering Ordering = ++ AtomicOrdering FailureOrdering = + static_cast(MI.getOperand(IsMasked ? 6 : 5).getImm()); + int hint; + +- switch (Ordering) { ++ switch (FailureOrdering) { + case AtomicOrdering::Acquire: + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 33a3197013cc..99328f09921f 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -4492,8 +4492,9 @@ LoongArchTargetLowering::shouldExpandAtomicCmpXchgInIR( + Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( + IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, + Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { +- Value *Ordering = +- Builder.getIntN(Subtarget.getGRLen(), static_cast(Ord)); ++ AtomicOrdering FailOrd = CI->getFailureOrdering(); ++ Value *FailureOrdering = ++ Builder.getIntN(Subtarget.getGRLen(), static_cast(FailOrd)); + + // TODO: Support cmpxchg on LA32. 
+ Intrinsic::ID CmpXchgIntrID = Intrinsic::loongarch_masked_cmpxchg_i64; +@@ -4504,7 +4505,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( + Function *MaskedCmpXchg = + Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Value *Result = Builder.CreateCall( +- MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); ++ MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering}); + Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); + return Result; + } +diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +index fcbd314507a5..ab1890556814 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td ++++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +@@ -1753,7 +1753,7 @@ def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMMinMax; + + class PseudoCmpXchg + : Pseudo<(outs GPR:$res, GPR:$scratch), +- (ins GPR:$addr, GPR:$cmpval, GPR:$newval, grlenimm:$ordering)> { ++ (ins GPR:$addr, GPR:$cmpval, GPR:$newval, grlenimm:$fail_order)> { + let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; + let mayLoad = 1; + let mayStore = 1; +@@ -1767,7 +1767,7 @@ def PseudoCmpXchg64 : PseudoCmpXchg; + def PseudoMaskedCmpXchg32 + : Pseudo<(outs GPR:$res, GPR:$scratch), + (ins GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, +- grlenimm:$ordering)> { ++ grlenimm:$fail_order)> { + let Constraints = "@earlyclobber $res,@earlyclobber $scratch"; + let mayLoad = 1; + let mayStore = 1; +@@ -1785,6 +1785,43 @@ class AtomicPat + : Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering), + (AMInst GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering)>; + ++// These atomic cmpxchg PatFrags only care about the failure ordering. ++// The PatFrags defined by multiclass `ternary_atomic_op_ord` in ++// TargetSelectionDAG.td care about the merged memory ordering that is the ++// stronger one between success and failure. 
But for LoongArch LL-SC we only ++// need to care about the failure ordering as explained in PR #67391. So we ++// define these PatFrags that will be used to define cmpxchg pats below. ++multiclass ternary_atomic_op_failure_ord { ++ def NAME#_failure_monotonic : PatFrag<(ops node:$ptr, node:$cmp, node:$val), ++ (!cast(NAME) node:$ptr, node:$cmp, node:$val), [{ ++ AtomicOrdering Ordering = cast(N)->getFailureOrdering(); ++ return Ordering == AtomicOrdering::Monotonic; ++ }]>; ++ def NAME#_failure_acquire : PatFrag<(ops node:$ptr, node:$cmp, node:$val), ++ (!cast(NAME) node:$ptr, node:$cmp, node:$val), [{ ++ AtomicOrdering Ordering = cast(N)->getFailureOrdering(); ++ return Ordering == AtomicOrdering::Acquire; ++ }]>; ++ def NAME#_failure_release : PatFrag<(ops node:$ptr, node:$cmp, node:$val), ++ (!cast(NAME) node:$ptr, node:$cmp, node:$val), [{ ++ AtomicOrdering Ordering = cast(N)->getFailureOrdering(); ++ return Ordering == AtomicOrdering::Release; ++ }]>; ++ def NAME#_failure_acq_rel : PatFrag<(ops node:$ptr, node:$cmp, node:$val), ++ (!cast(NAME) node:$ptr, node:$cmp, node:$val), [{ ++ AtomicOrdering Ordering = cast(N)->getFailureOrdering(); ++ return Ordering == AtomicOrdering::AcquireRelease; ++ }]>; ++ def NAME#_failure_seq_cst : PatFrag<(ops node:$ptr, node:$cmp, node:$val), ++ (!cast(NAME) node:$ptr, node:$cmp, node:$val), [{ ++ AtomicOrdering Ordering = cast(N)->getFailureOrdering(); ++ return Ordering == AtomicOrdering::SequentiallyConsistent; ++ }]>; ++} ++ ++defm atomic_cmp_swap_32 : ternary_atomic_op_failure_ord; ++defm atomic_cmp_swap_64 : ternary_atomic_op_failure_ord; ++ + let Predicates = [IsLA64] in { + def : AtomicPat; +@@ -1847,24 +1884,24 @@ def : AtomicPat { +- def : Pat<(vt (!cast(Op#"_monotonic") GPR:$addr, GPR:$cmp, GPR:$new)), ++ def : Pat<(vt (!cast(Op#"_failure_monotonic") GPR:$addr, GPR:$cmp, GPR:$new)), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 2)>; +- def : Pat<(vt (!cast(Op#"_acquire") GPR:$addr, GPR:$cmp, GPR:$new)), ++ def : 
Pat<(vt (!cast(Op#"_failure_acquire") GPR:$addr, GPR:$cmp, GPR:$new)), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 4)>; +- def : Pat<(vt (!cast(Op#"_release") GPR:$addr, GPR:$cmp, GPR:$new)), ++ def : Pat<(vt (!cast(Op#"_failure_release") GPR:$addr, GPR:$cmp, GPR:$new)), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 5)>; +- def : Pat<(vt (!cast(Op#"_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new)), ++ def : Pat<(vt (!cast(Op#"_failure_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new)), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 6)>; +- def : Pat<(vt (!cast(Op#"_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new)), ++ def : Pat<(vt (!cast(Op#"_failure_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new)), + (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>; + } + + defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>; + defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>; + def : Pat<(int_loongarch_masked_cmpxchg_i64 +- GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering), ++ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$fail_order), + (PseudoMaskedCmpXchg32 +- GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>; ++ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$fail_order)>; + + def : PseudoMaskedAMMinMaxPat; +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +index 174bb9d0ff7d..1dd3f39852d8 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +@@ -132,7 +132,7 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind { + ; LA64-NEXT: beqz $a5, .LBB4_1 + ; LA64-NEXT: b .LBB4_4 + ; LA64-NEXT: .LBB4_3: +-; LA64-NEXT: dbar 20 ++; LA64-NEXT: dbar 1792 + ; LA64-NEXT: .LBB4_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire monotonic +@@ -166,7 +166,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 
%val) nounwin + ; LA64-NEXT: beqz $a5, .LBB5_1 + ; LA64-NEXT: b .LBB5_4 + ; LA64-NEXT: .LBB5_3: +-; LA64-NEXT: dbar 20 ++; LA64-NEXT: dbar 1792 + ; LA64-NEXT: .LBB5_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire monotonic +@@ -185,7 +185,7 @@ define void @cmpxchg_i32_acquire_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwin + ; LA64-NEXT: beqz $a4, .LBB6_1 + ; LA64-NEXT: b .LBB6_4 + ; LA64-NEXT: .LBB6_3: +-; LA64-NEXT: dbar 20 ++; LA64-NEXT: dbar 1792 + ; LA64-NEXT: .LBB6_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire monotonic +@@ -204,7 +204,7 @@ define void @cmpxchg_i64_acquire_monotonic(ptr %ptr, i64 %cmp, i64 %val) nounwin + ; LA64-NEXT: beqz $a4, .LBB7_1 + ; LA64-NEXT: b .LBB7_4 + ; LA64-NEXT: .LBB7_3: +-; LA64-NEXT: dbar 20 ++; LA64-NEXT: dbar 1792 + ; LA64-NEXT: .LBB7_4: + ; LA64-NEXT: ret + %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire monotonic +-- +2.20.1 + + +From d8479f9765b12a84d1756aedebf631fdbe4f0451 Mon Sep 17 00:00:00 2001 +From: Lu Weining +Date: Mon, 4 Mar 2024 08:38:52 +0800 +Subject: [PATCH 6/7] [LoongArch] Override + LoongArchTargetLowering::getExtendForAtomicCmpSwapArg (#83656) + +This patch aims to solve Firefox issue: +https://bugzilla.mozilla.org/show_bug.cgi?id=1882301 + +Similar to 616289ed2922. Currently LoongArch uses an ll.[wd]/sc.[wd] +loop for ATOMIC_CMP_XCHG. Because the comparison in the loop is +full-width (i.e. the `bne` instruction), we must sign extend the input +comparsion argument. + +Note that LoongArch ISA manual V1.1 has introduced compare-and-swap +instructions. We would change the implementation (return `ANY_EXTEND`) +when we support them. 
+ +(cherry picked from commit 5f058aa211995d2f0df2a0e063532832569cb7a8) +(cherry picked from commit ea6c457b8dd2d0e6a7f05b4a5bdd2686085e1ec0) +--- + .../LoongArch/LoongArchISelLowering.cpp | 5 + + .../Target/LoongArch/LoongArchISelLowering.h | 2 + + .../LoongArch/atomicrmw-uinc-udec-wrap.ll | 120 +++++++------ + .../ir-instruction/atomic-cmpxchg.ll | 25 +-- + .../LoongArch/ir-instruction/atomicrmw-fp.ll | 160 +++++++++--------- + 5 files changed, 159 insertions(+), 153 deletions(-) + +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +index 99328f09921f..4fc2b4709840 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +@@ -4893,3 +4893,8 @@ bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const { + + return !isa(Y); + } ++ ++ISD::NodeType LoongArchTargetLowering::getExtendForAtomicCmpSwapArg() const { ++ // TODO: LAMCAS will use amcas{_DB,}.[bhwd] which does not require extension. 
++ return ISD::SIGN_EXTEND; ++} +diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +index 23b90640a690..2c9826a13237 100644 +--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h ++++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +@@ -203,6 +203,8 @@ public: + return ISD::SIGN_EXTEND; + } + ++ ISD::NodeType getExtendForAtomicCmpSwapArg() const override; ++ + Register getRegisterByName(const char *RegName, LLT VT, + const MachineFunction &MF) const override; + bool mayBeEmittedAsTailCall(const CallInst *CI) const override; +diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +index d8908acbc945..f0baf19bcf0e 100644 +--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll ++++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll +@@ -26,15 +26,16 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: andi $a5, $a5, 255 + ; LA64-NEXT: sll.w $a5, $a5, $a0 + ; LA64-NEXT: and $a6, $a3, $a4 +-; LA64-NEXT: or $a6, $a6, $a5 ++; LA64-NEXT: or $a5, $a6, $a5 ++; LA64-NEXT: addi.w $a6, $a3, 0 + ; LA64-NEXT: .LBB0_3: # %atomicrmw.start + ; LA64-NEXT: # Parent Loop BB0_1 Depth=1 + ; LA64-NEXT: # => This Inner Loop Header: Depth=2 +-; LA64-NEXT: ll.w $a5, $a2, 0 +-; LA64-NEXT: bne $a5, $a3, .LBB0_5 ++; LA64-NEXT: ll.w $a3, $a2, 0 ++; LA64-NEXT: bne $a3, $a6, .LBB0_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_3 Depth=2 +-; LA64-NEXT: move $a7, $a6 ++; LA64-NEXT: move $a7, $a5 + ; LA64-NEXT: sc.w $a7, $a2, 0 + ; LA64-NEXT: beqz $a7, .LBB0_3 + ; LA64-NEXT: b .LBB0_6 +@@ -43,11 +44,9 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1 +-; LA64-NEXT: addi.w $a6, $a3, 0 +-; LA64-NEXT: move $a3, $a5 +-; LA64-NEXT: bne $a5, $a6, .LBB0_1 ++; LA64-NEXT: bne $a3, 
$a6, .LBB0_1 + ; LA64-NEXT: # %bb.2: # %atomicrmw.end +-; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a0 + ; LA64-NEXT: ret + %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst + ret i8 %result +@@ -79,15 +78,16 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: bstrpick.d $a5, $a5, 15, 0 + ; LA64-NEXT: sll.w $a5, $a5, $a0 + ; LA64-NEXT: and $a6, $a3, $a4 +-; LA64-NEXT: or $a6, $a6, $a5 ++; LA64-NEXT: or $a5, $a6, $a5 ++; LA64-NEXT: addi.w $a6, $a3, 0 + ; LA64-NEXT: .LBB1_3: # %atomicrmw.start + ; LA64-NEXT: # Parent Loop BB1_1 Depth=1 + ; LA64-NEXT: # => This Inner Loop Header: Depth=2 +-; LA64-NEXT: ll.w $a5, $a2, 0 +-; LA64-NEXT: bne $a5, $a3, .LBB1_5 ++; LA64-NEXT: ll.w $a3, $a2, 0 ++; LA64-NEXT: bne $a3, $a6, .LBB1_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_3 Depth=2 +-; LA64-NEXT: move $a7, $a6 ++; LA64-NEXT: move $a7, $a5 + ; LA64-NEXT: sc.w $a7, $a2, 0 + ; LA64-NEXT: beqz $a7, .LBB1_3 + ; LA64-NEXT: b .LBB1_6 +@@ -96,11 +96,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1 +-; LA64-NEXT: addi.w $a6, $a3, 0 +-; LA64-NEXT: move $a3, $a5 +-; LA64-NEXT: bne $a5, $a6, .LBB1_1 ++; LA64-NEXT: bne $a3, $a6, .LBB1_1 + ; LA64-NEXT: # %bb.2: # %atomicrmw.end +-; LA64-NEXT: srl.w $a0, $a5, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a0 + ; LA64-NEXT: ret + %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst + ret i16 %result +@@ -109,37 +107,36 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) { + define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-LABEL: atomicrmw_uinc_wrap_i32: + ; LA64: # %bb.0: +-; LA64-NEXT: ld.w $a3, $a0, 0 +-; LA64-NEXT: addi.w $a2, $a1, 0 ++; LA64-NEXT: ld.w $a2, $a0, 0 ++; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .p2align 4, , 16 + ; LA64-NEXT: .LBB2_1: # %atomicrmw.start + ; LA64-NEXT: # =>This Loop Header: 
Depth=1 + ; LA64-NEXT: # Child Loop BB2_3 Depth 2 +-; LA64-NEXT: addi.w $a4, $a3, 0 +-; LA64-NEXT: sltu $a1, $a4, $a2 +-; LA64-NEXT: xori $a1, $a1, 1 +-; LA64-NEXT: addi.d $a5, $a3, 1 +-; LA64-NEXT: masknez $a5, $a5, $a1 ++; LA64-NEXT: addi.w $a3, $a2, 0 ++; LA64-NEXT: sltu $a4, $a3, $a1 ++; LA64-NEXT: xori $a4, $a4, 1 ++; LA64-NEXT: addi.d $a2, $a2, 1 ++; LA64-NEXT: masknez $a4, $a2, $a4 + ; LA64-NEXT: .LBB2_3: # %atomicrmw.start + ; LA64-NEXT: # Parent Loop BB2_1 Depth=1 + ; LA64-NEXT: # => This Inner Loop Header: Depth=2 +-; LA64-NEXT: ll.w $a1, $a0, 0 +-; LA64-NEXT: bne $a1, $a3, .LBB2_5 ++; LA64-NEXT: ll.w $a2, $a0, 0 ++; LA64-NEXT: bne $a2, $a3, .LBB2_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_3 Depth=2 +-; LA64-NEXT: move $a6, $a5 +-; LA64-NEXT: sc.w $a6, $a0, 0 +-; LA64-NEXT: beqz $a6, .LBB2_3 ++; LA64-NEXT: move $a5, $a4 ++; LA64-NEXT: sc.w $a5, $a0, 0 ++; LA64-NEXT: beqz $a5, .LBB2_3 + ; LA64-NEXT: b .LBB2_6 + ; LA64-NEXT: .LBB2_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1 +-; LA64-NEXT: move $a3, $a1 +-; LA64-NEXT: bne $a1, $a4, .LBB2_1 ++; LA64-NEXT: bne $a2, $a3, .LBB2_1 + ; LA64-NEXT: # %bb.2: # %atomicrmw.end +-; LA64-NEXT: move $a0, $a1 ++; LA64-NEXT: move $a0, $a2 + ; LA64-NEXT: ret + %result = atomicrmw uinc_wrap ptr %ptr, i32 %val seq_cst + ret i32 %result +@@ -212,15 +209,16 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: andi $a6, $a6, 255 + ; LA64-NEXT: sll.w $a6, $a6, $a0 + ; LA64-NEXT: and $a7, $a3, $a4 +-; LA64-NEXT: or $a7, $a7, $a6 ++; LA64-NEXT: or $a6, $a7, $a6 ++; LA64-NEXT: addi.w $a7, $a3, 0 + ; LA64-NEXT: .LBB4_3: # %atomicrmw.start + ; LA64-NEXT: # Parent Loop BB4_1 Depth=1 + ; LA64-NEXT: # => This Inner Loop Header: Depth=2 +-; LA64-NEXT: ll.w $a6, $a2, 0 +-; LA64-NEXT: bne $a6, $a3, .LBB4_5 ++; LA64-NEXT: ll.w $a3, $a2, 0 ++; 
LA64-NEXT: bne $a3, $a7, .LBB4_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_3 Depth=2 +-; LA64-NEXT: move $t0, $a7 ++; LA64-NEXT: move $t0, $a6 + ; LA64-NEXT: sc.w $t0, $a2, 0 + ; LA64-NEXT: beqz $t0, .LBB4_3 + ; LA64-NEXT: b .LBB4_6 +@@ -229,11 +227,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) { + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB4_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1 +-; LA64-NEXT: addi.w $a7, $a3, 0 +-; LA64-NEXT: move $a3, $a6 +-; LA64-NEXT: bne $a6, $a7, .LBB4_1 ++; LA64-NEXT: bne $a3, $a7, .LBB4_1 + ; LA64-NEXT: # %bb.2: # %atomicrmw.end +-; LA64-NEXT: srl.w $a0, $a6, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a0 + ; LA64-NEXT: ret + %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst + ret i8 %result +@@ -270,15 +266,16 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: bstrpick.d $a6, $a6, 15, 0 + ; LA64-NEXT: sll.w $a6, $a6, $a0 + ; LA64-NEXT: and $a7, $a3, $a4 +-; LA64-NEXT: or $a7, $a7, $a6 ++; LA64-NEXT: or $a6, $a7, $a6 ++; LA64-NEXT: addi.w $a7, $a3, 0 + ; LA64-NEXT: .LBB5_3: # %atomicrmw.start + ; LA64-NEXT: # Parent Loop BB5_1 Depth=1 + ; LA64-NEXT: # => This Inner Loop Header: Depth=2 +-; LA64-NEXT: ll.w $a6, $a2, 0 +-; LA64-NEXT: bne $a6, $a3, .LBB5_5 ++; LA64-NEXT: ll.w $a3, $a2, 0 ++; LA64-NEXT: bne $a3, $a7, .LBB5_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_3 Depth=2 +-; LA64-NEXT: move $t0, $a7 ++; LA64-NEXT: move $t0, $a6 + ; LA64-NEXT: sc.w $t0, $a2, 0 + ; LA64-NEXT: beqz $t0, .LBB5_3 + ; LA64-NEXT: b .LBB5_6 +@@ -287,11 +284,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB5_6: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1 +-; LA64-NEXT: addi.w $a7, $a3, 0 +-; LA64-NEXT: move $a3, $a6 +-; LA64-NEXT: bne $a6, $a7, .LBB5_1 ++; LA64-NEXT: bne $a3, $a7, .LBB5_1 + ; LA64-NEXT: # %bb.2: # %atomicrmw.end +-; LA64-NEXT: srl.w 
$a0, $a6, $a0 ++; LA64-NEXT: srl.w $a0, $a3, $a0 + ; LA64-NEXT: ret + %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst + ret i16 %result +@@ -300,22 +295,22 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) { + define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-LABEL: atomicrmw_udec_wrap_i32: + ; LA64: # %bb.0: +-; LA64-NEXT: ld.w $a4, $a0, 0 ++; LA64-NEXT: ld.w $a2, $a0, 0 + ; LA64-NEXT: addi.w $a3, $a1, 0 + ; LA64-NEXT: .p2align 4, , 16 + ; LA64-NEXT: .LBB6_1: # %atomicrmw.start + ; LA64-NEXT: # =>This Loop Header: Depth=1 + ; LA64-NEXT: # Child Loop BB6_3 Depth 2 +-; LA64-NEXT: addi.w $a5, $a4, 0 +-; LA64-NEXT: sltu $a2, $a3, $a5 +-; LA64-NEXT: addi.d $a6, $a4, -1 +-; LA64-NEXT: masknez $a6, $a6, $a2 +-; LA64-NEXT: maskeqz $a2, $a1, $a2 +-; LA64-NEXT: or $a2, $a2, $a6 +-; LA64-NEXT: sltui $a6, $a5, 1 +-; LA64-NEXT: masknez $a2, $a2, $a6 +-; LA64-NEXT: maskeqz $a6, $a1, $a6 +-; LA64-NEXT: or $a6, $a6, $a2 ++; LA64-NEXT: addi.w $a4, $a2, 0 ++; LA64-NEXT: sltu $a5, $a3, $a4 ++; LA64-NEXT: addi.d $a2, $a2, -1 ++; LA64-NEXT: masknez $a2, $a2, $a5 ++; LA64-NEXT: maskeqz $a5, $a1, $a5 ++; LA64-NEXT: or $a2, $a5, $a2 ++; LA64-NEXT: sltui $a5, $a4, 1 ++; LA64-NEXT: masknez $a2, $a2, $a5 ++; LA64-NEXT: maskeqz $a5, $a1, $a5 ++; LA64-NEXT: or $a5, $a5, $a2 + ; LA64-NEXT: .LBB6_3: # %atomicrmw.start + ; LA64-NEXT: # Parent Loop BB6_1 Depth=1 + ; LA64-NEXT: # => This Inner Loop Header: Depth=2 +@@ -323,17 +318,16 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) { + ; LA64-NEXT: bne $a2, $a4, .LBB6_5 + ; LA64-NEXT: # %bb.4: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_3 Depth=2 +-; LA64-NEXT: move $a7, $a6 +-; LA64-NEXT: sc.w $a7, $a0, 0 +-; LA64-NEXT: beqz $a7, .LBB6_3 ++; LA64-NEXT: move $a6, $a5 ++; LA64-NEXT: sc.w $a6, $a0, 0 ++; LA64-NEXT: beqz $a6, .LBB6_3 + ; LA64-NEXT: b .LBB6_6 + ; LA64-NEXT: .LBB6_5: # %atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB6_6: # 
%atomicrmw.start + ; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1 +-; LA64-NEXT: move $a4, $a2 +-; LA64-NEXT: bne $a2, $a5, .LBB6_1 ++; LA64-NEXT: bne $a2, $a4, .LBB6_1 + ; LA64-NEXT: # %bb.2: # %atomicrmw.end + ; LA64-NEXT: move $a0, $a2 + ; LA64-NEXT: ret +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +index 1dd3f39852d8..ebb09640e6c9 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll +@@ -71,6 +71,7 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind + define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_acquire_acquire: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB2_3 +@@ -176,6 +177,7 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin + define void @cmpxchg_i32_acquire_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_acquire_monotonic: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB6_3 +@@ -285,9 +287,10 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou + define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti32: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $a1, 0 + ; LA64-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: ll.w $a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB10_3 ++; LA64-NEXT: ll.w $a1, $a0, 0 ++; LA64-NEXT: bne $a1, $a3, .LBB10_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: 
sc.w $a4, $a0, 0 +@@ -296,7 +299,7 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou + ; LA64-NEXT: .LBB10_3: + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB10_4: +-; LA64-NEXT: move $a0, $a3 ++; LA64-NEXT: move $a0, $a1 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire + %res = extractvalue { i32, i1 } %tmp, 0 +@@ -404,6 +407,7 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw + define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_acquire_acquire_reti1: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB14_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB14_3 +@@ -415,8 +419,7 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw + ; LA64-NEXT: .LBB14_3: + ; LA64-NEXT: dbar 20 + ; LA64-NEXT: .LBB14_4: +-; LA64-NEXT: addi.w $a0, $a1, 0 +-; LA64-NEXT: xor $a0, $a3, $a0 ++; LA64-NEXT: xor $a0, $a3, $a1 + ; LA64-NEXT: sltui $a0, $a0, 1 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire +@@ -516,6 +519,7 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw + define void @cmpxchg_i32_monotonic_monotonic(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB18_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB18_3 +@@ -625,9 +629,10 @@ define i16 @cmpxchg_i16_monotonic_monotonic_reti16(ptr %ptr, i16 %cmp, i16 %val) + define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti32: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a3, $a1, 0 + ; LA64-NEXT: .LBB22_1: # =>This Inner Loop Header: Depth=1 +-; LA64-NEXT: ll.w 
$a3, $a0, 0 +-; LA64-NEXT: bne $a3, $a1, .LBB22_3 ++; LA64-NEXT: ll.w $a1, $a0, 0 ++; LA64-NEXT: bne $a1, $a3, .LBB22_3 + ; LA64-NEXT: # %bb.2: # in Loop: Header=BB22_1 Depth=1 + ; LA64-NEXT: move $a4, $a2 + ; LA64-NEXT: sc.w $a4, $a0, 0 +@@ -636,7 +641,7 @@ define i32 @cmpxchg_i32_monotonic_monotonic_reti32(ptr %ptr, i32 %cmp, i32 %val) + ; LA64-NEXT: .LBB22_3: + ; LA64-NEXT: dbar 1792 + ; LA64-NEXT: .LBB22_4: +-; LA64-NEXT: move $a0, $a3 ++; LA64-NEXT: move $a0, $a1 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic + %res = extractvalue { i32, i1 } %tmp, 0 +@@ -744,6 +749,7 @@ define i1 @cmpxchg_i16_monotonic_monotonic_reti1(ptr %ptr, i16 %cmp, i16 %val) n + define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) nounwind { + ; LA64-LABEL: cmpxchg_i32_monotonic_monotonic_reti1: + ; LA64: # %bb.0: ++; LA64-NEXT: addi.w $a1, $a1, 0 + ; LA64-NEXT: .LBB26_1: # =>This Inner Loop Header: Depth=1 + ; LA64-NEXT: ll.w $a3, $a0, 0 + ; LA64-NEXT: bne $a3, $a1, .LBB26_3 +@@ -755,8 +761,7 @@ define i1 @cmpxchg_i32_monotonic_monotonic_reti1(ptr %ptr, i32 %cmp, i32 %val) n + ; LA64-NEXT: .LBB26_3: + ; LA64-NEXT: dbar 1792 + ; LA64-NEXT: .LBB26_4: +-; LA64-NEXT: addi.w $a0, $a1, 0 +-; LA64-NEXT: xor $a0, $a3, $a0 ++; LA64-NEXT: xor $a0, $a3, $a1 + ; LA64-NEXT: sltui $a0, $a0, 1 + ; LA64-NEXT: ret + %tmp = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic +diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +index 589360823b14..4d8160d70803 100644 +--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll ++++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +@@ -16,6 +16,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB0_3: # %atomicrmw.start 
+ ; LA64F-NEXT: # Parent Loop BB0_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -33,8 +34,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64F-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB0_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB0_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -51,6 +51,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB0_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB0_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -68,8 +69,7 @@ define float @float_fadd_acquire(ptr %p) nounwind { + ; LA64D-NEXT: .LBB0_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB0_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB0_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB0_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fadd ptr %p, float 1.0 acquire, align 4 +@@ -90,6 +90,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB1_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB1_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -107,8 +108,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64F-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB1_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB1_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: 
ret + ; +@@ -125,6 +125,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB1_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB1_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -142,8 +143,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { + ; LA64D-NEXT: .LBB1_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB1_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB1_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB1_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fsub ptr %p, float 1.0 acquire, align 4 +@@ -165,6 +165,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB2_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB2_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -182,8 +183,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64F-NEXT: .LBB2_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB2_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB2_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -201,6 +201,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB2_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB2_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -218,8 +219,7 @@ define float @float_fmin_acquire(ptr %p) nounwind { + ; LA64D-NEXT: 
.LBB2_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB2_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB2_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB2_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmin ptr %p, float 1.0 acquire, align 4 +@@ -241,6 +241,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB3_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB3_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -258,8 +259,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64F-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB3_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB3_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB3_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -277,6 +277,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB3_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB3_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -294,8 +295,7 @@ define float @float_fmax_acquire(ptr %p) nounwind { + ; LA64D-NEXT: .LBB3_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB3_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB3_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB3_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmax ptr %p, float 1.0 acquire, align 4 +@@ -694,6 +694,7 @@ define float @float_fadd_release(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; 
LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB8_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB8_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -711,8 +712,7 @@ define float @float_fadd_release(ptr %p) nounwind { + ; LA64F-NEXT: .LBB8_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB8_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB8_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB8_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -729,6 +729,7 @@ define float @float_fadd_release(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB8_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB8_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -746,8 +747,7 @@ define float @float_fadd_release(ptr %p) nounwind { + ; LA64D-NEXT: .LBB8_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB8_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB8_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB8_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fadd ptr %p, float 1.0 release, align 4 +@@ -768,6 +768,7 @@ define float @float_fsub_release(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB9_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB9_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -785,8 +786,7 @@ define float @float_fsub_release(ptr %p) nounwind { + ; LA64F-NEXT: .LBB9_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB9_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; 
LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB9_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB9_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -803,6 +803,7 @@ define float @float_fsub_release(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB9_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB9_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -820,8 +821,7 @@ define float @float_fsub_release(ptr %p) nounwind { + ; LA64D-NEXT: .LBB9_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB9_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB9_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB9_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fsub ptr %p, float 1.0 release, align 4 +@@ -843,6 +843,7 @@ define float @float_fmin_release(ptr %p) nounwind { + ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB10_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB10_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -860,8 +861,7 @@ define float @float_fmin_release(ptr %p) nounwind { + ; LA64F-NEXT: .LBB10_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB10_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB10_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB10_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -879,6 +879,7 @@ define float @float_fmin_release(ptr %p) nounwind { + ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB10_3: # %atomicrmw.start + ; LA64D-NEXT: 
# Parent Loop BB10_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -896,8 +897,7 @@ define float @float_fmin_release(ptr %p) nounwind { + ; LA64D-NEXT: .LBB10_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB10_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB10_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB10_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmin ptr %p, float 1.0 release, align 4 +@@ -919,6 +919,7 @@ define float @float_fmax_release(ptr %p) nounwind { + ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB11_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB11_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -936,8 +937,7 @@ define float @float_fmax_release(ptr %p) nounwind { + ; LA64F-NEXT: .LBB11_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB11_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB11_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB11_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -955,6 +955,7 @@ define float @float_fmax_release(ptr %p) nounwind { + ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB11_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB11_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -972,8 +973,7 @@ define float @float_fmax_release(ptr %p) nounwind { + ; LA64D-NEXT: .LBB11_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB11_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB11_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB11_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; 
LA64D-NEXT: ret + %v = atomicrmw fmax ptr %p, float 1.0 release, align 4 +@@ -1372,6 +1372,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB16_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB16_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1389,8 +1390,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: .LBB16_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB16_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB16_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB16_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -1407,6 +1407,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB16_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB16_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1424,8 +1425,7 @@ define float @float_fadd_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: .LBB16_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB16_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB16_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB16_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fadd ptr %p, float 1.0 acq_rel, align 4 +@@ -1446,6 +1446,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB17_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB17_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 
+@@ -1463,8 +1464,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: .LBB17_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB17_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB17_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB17_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -1481,6 +1481,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB17_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB17_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1498,8 +1499,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: .LBB17_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB17_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB17_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB17_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fsub ptr %p, float 1.0 acq_rel, align 4 +@@ -1521,6 +1521,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB18_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB18_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1538,8 +1539,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: .LBB18_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB18_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB18_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB18_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -1557,6 +1557,7 @@ define float 
@float_fmin_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB18_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB18_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1574,8 +1575,7 @@ define float @float_fmin_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: .LBB18_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB18_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB18_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB18_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmin ptr %p, float 1.0 acq_rel, align 4 +@@ -1597,6 +1597,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB19_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB19_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1614,8 +1615,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { + ; LA64F-NEXT: .LBB19_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB19_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB19_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB19_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -1633,6 +1633,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB19_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB19_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -1650,8 +1651,7 @@ define float @float_fmax_acq_rel(ptr %p) nounwind { + ; LA64D-NEXT: .LBB19_6: # 
%atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB19_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB19_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB19_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmax ptr %p, float 1.0 acq_rel, align 4 +@@ -2074,6 +2074,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB24_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB24_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2091,8 +2092,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: .LBB24_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB24_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB24_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB24_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2109,6 +2109,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB24_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB24_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2126,8 +2127,7 @@ define float @float_fadd_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: .LBB24_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB24_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB24_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB24_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fadd ptr %p, float 1.0 seq_cst, align 4 +@@ -2148,6 +2148,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, 
$fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB25_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB25_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2165,8 +2166,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: .LBB25_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB25_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB25_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB25_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2183,6 +2183,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB25_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB25_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2200,8 +2201,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: .LBB25_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB25_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB25_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB25_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fsub ptr %p, float 1.0 seq_cst, align 4 +@@ -2223,6 +2223,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB26_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB26_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2240,8 +2241,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: .LBB26_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB26_1 Depth=1 + ; 
LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB26_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB26_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2259,6 +2259,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB26_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB26_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2276,8 +2277,7 @@ define float @float_fmin_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: .LBB26_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB26_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB26_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB26_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmin ptr %p, float 1.0 seq_cst, align 4 +@@ -2299,6 +2299,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB27_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB27_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2316,8 +2317,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { + ; LA64F-NEXT: .LBB27_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB27_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB27_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB27_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2335,6 +2335,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; 
LA64D-NEXT: .LBB27_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB27_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2352,8 +2353,7 @@ define float @float_fmax_seq_cst(ptr %p) nounwind { + ; LA64D-NEXT: .LBB27_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB27_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB27_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB27_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmax ptr %p, float 1.0 seq_cst, align 4 +@@ -2752,6 +2752,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB32_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB32_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2769,8 +2770,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: .LBB32_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB32_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB32_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB32_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2787,6 +2787,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB32_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB32_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2804,8 +2805,7 @@ define float @float_fadd_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: .LBB32_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB32_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB32_1 ++; LA64D-NEXT: 
bne $a3, $a2, .LBB32_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fadd ptr %p, float 1.0 monotonic, align 4 +@@ -2826,6 +2826,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB33_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB33_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2843,8 +2844,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: .LBB33_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB33_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB33_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB33_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2861,6 +2861,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: fadd.s $fa2, $fa0, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB33_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB33_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2878,8 +2879,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: .LBB33_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB33_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB33_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB33_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fsub ptr %p, float 1.0 monotonic, align 4 +@@ -2901,6 +2901,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB34_3: # %atomicrmw.start + ; LA64F-NEXT: # 
Parent Loop BB34_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2918,8 +2919,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: .LBB34_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB34_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB34_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB34_1 + ; LA64F-NEXT: # %bb.2: # %atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -2937,6 +2937,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: fmin.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB34_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB34_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2954,8 +2955,7 @@ define float @float_fmin_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: .LBB34_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB34_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB34_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB34_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmin ptr %p, float 1.0 monotonic, align 4 +@@ -2977,6 +2977,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64F-NEXT: movfr2gr.s $a1, $fa2 + ; LA64F-NEXT: movfr2gr.s $a2, $fa0 ++; LA64F-NEXT: addi.w $a2, $a2, 0 + ; LA64F-NEXT: .LBB35_3: # %atomicrmw.start + ; LA64F-NEXT: # Parent Loop BB35_1 Depth=1 + ; LA64F-NEXT: # => This Inner Loop Header: Depth=2 +@@ -2994,8 +2995,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { + ; LA64F-NEXT: .LBB35_6: # %atomicrmw.start + ; LA64F-NEXT: # in Loop: Header=BB35_1 Depth=1 + ; LA64F-NEXT: movgr2fr.w $fa0, $a3 +-; LA64F-NEXT: addi.w $a1, $a2, 0 +-; LA64F-NEXT: bne $a3, $a1, .LBB35_1 ++; LA64F-NEXT: bne $a3, $a2, .LBB35_1 + ; LA64F-NEXT: # %bb.2: # 
%atomicrmw.end + ; LA64F-NEXT: ret + ; +@@ -3013,6 +3013,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: fmax.s $fa2, $fa2, $fa1 + ; LA64D-NEXT: movfr2gr.s $a1, $fa2 + ; LA64D-NEXT: movfr2gr.s $a2, $fa0 ++; LA64D-NEXT: addi.w $a2, $a2, 0 + ; LA64D-NEXT: .LBB35_3: # %atomicrmw.start + ; LA64D-NEXT: # Parent Loop BB35_1 Depth=1 + ; LA64D-NEXT: # => This Inner Loop Header: Depth=2 +@@ -3030,8 +3031,7 @@ define float @float_fmax_monotonic(ptr %p) nounwind { + ; LA64D-NEXT: .LBB35_6: # %atomicrmw.start + ; LA64D-NEXT: # in Loop: Header=BB35_1 Depth=1 + ; LA64D-NEXT: movgr2fr.w $fa0, $a3 +-; LA64D-NEXT: addi.w $a1, $a2, 0 +-; LA64D-NEXT: bne $a3, $a1, .LBB35_1 ++; LA64D-NEXT: bne $a3, $a2, .LBB35_1 + ; LA64D-NEXT: # %bb.2: # %atomicrmw.end + ; LA64D-NEXT: ret + %v = atomicrmw fmax ptr %p, float 1.0 monotonic, align 4 +-- +2.20.1 + + +From 4713504e1495a3831ba1a47078de83db3a8a533c Mon Sep 17 00:00:00 2001 +From: Lu Weining +Date: Thu, 19 Oct 2023 10:16:13 +0800 +Subject: [PATCH 7/7] [Memory] Call __clear_cache in InvalidateInstructionCache + on LoongArch (#67285) + +As the comments of `InvalidateInstructionCache`: Before the JIT can run +a block of code that has been emitted it must invalidate the instruction +cache on some platforms. I think it applies to LoongArch as LoongArch +has a weak memory-model. But I'm not able to write a test to demonstrate +this issue. Perhaps self-modifing code should be wrote? 
+ +(cherry picked from commit fb366581e7d67df7d9a98605fd65a7e7908451e7) +--- + llvm/lib/Support/Unix/Memory.inc | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/llvm/lib/Support/Unix/Memory.inc b/llvm/lib/Support/Unix/Memory.inc +index 4c8f6b2ea7d3..69bd1164343d 100644 +--- a/llvm/lib/Support/Unix/Memory.inc ++++ b/llvm/lib/Support/Unix/Memory.inc +@@ -237,7 +237,8 @@ void Memory::InvalidateInstructionCache(const void *Addr, size_t Len) { + for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) + asm volatile("icbi 0, %0" : : "r"(Line)); + asm volatile("isync"); +-#elif (defined(__arm__) || defined(__aarch64__) || defined(__mips__)) && \ ++#elif (defined(__arm__) || defined(__aarch64__) || defined(__loongarch__) || \ ++ defined(__mips__)) && \ + defined(__GNUC__) + // FIXME: Can we safely always call this for __GNUC__ everywhere? + const char *Start = static_cast(Addr); +-- +2.20.1 + diff --git a/0020-Update-llvm-lit-config-to-support-build_for_openeule.patch b/0020-Update-llvm-lit-config-to-support-build_for_openeule.patch new file mode 100644 index 0000000..48a4ff5 --- /dev/null +++ b/0020-Update-llvm-lit-config-to-support-build_for_openeule.patch @@ -0,0 +1,30 @@ +From 5721be433ddee5f60d4a9434df43a023f1ec4c0e Mon Sep 17 00:00:00 2001 +From: wangqiang +Date: Sun, 28 Apr 2024 14:30:34 +0800 +Subject: [PATCH] Update llvm-lit config to support build_for_openeuler + +--- + llvm/cmake/modules/HandleLLVMOptions.cmake | 7 +++++++ + 1 files changed, 7 insertions(+) + +diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake +index 76723be69..c6f5569af 100644 +--- a/llvm/cmake/modules/HandleLLVMOptions.cmake ++++ b/llvm/cmake/modules/HandleLLVMOptions.cmake +@@ -97,6 +97,13 @@ if( LLVM_ENABLE_ASSERTIONS ) + set(LLVM_ENABLE_CLASSIC_FLANG 0) + endif() + ++option(BUILD_FOR_OPENEULER "Build support for openeuler" OFF) ++if(BUILD_FOR_OPENEULER) ++ set(BUILD_FOR_OPENEULER 1) ++else() ++ 
set(BUILD_FOR_OPENEULER 0) ++endif() ++ + if(LLVM_ENABLE_EXPENSIVE_CHECKS) + add_compile_definitions(EXPENSIVE_CHECKS) + +-- +2.33.0 + diff --git a/0021-Add-BiSheng-Autotuner-support-for-LLVM-compiler.patch b/0021-Add-BiSheng-Autotuner-support-for-LLVM-compiler.patch new file mode 100644 index 0000000..ea61cb3 --- /dev/null +++ b/0021-Add-BiSheng-Autotuner-support-for-LLVM-compiler.patch @@ -0,0 +1,9915 @@ +From a9863e2b6e6783aa9be0b9d1d187084fd4b32a3a Mon Sep 17 00:00:00 2001 +From: Muhammad Asif Manzoor +Date: Thu, 21 Mar 2024 12:50:38 -0400 +Subject: Add BiSheng Autotuner support for LLVM compiler + +Automatic tuning is an automatic iterative process that optimizes a given +program by manipulating compilation options for optimal performance. +BiSheng Autotuner provides a resumable interface for tuning process. BiSheng +Autotuner can tune 1) individual code segments/blocks (fine grain turning) like +loops, callsites, instructions, etc. and 2) entire modules/programs (coarse +grain tuning) for compiler flags, pass ordering, etc. +This patch enables LLVM compiler to extract tuneable code regions and then apply +suggested configuration (by Autotuner) to find out the optimal configurations. 
+--- + llvm/cmake/modules/CrossCompile.cmake | 1 + + llvm/cmake/modules/HandleLLVMOptions.cmake | 8 + + llvm/include/llvm/Analysis/AutotuningDump.h | 75 ++ + llvm/include/llvm/Analysis/LoopInfo.h | 13 + + llvm/include/llvm/Analysis/Passes.h | 10 + + llvm/include/llvm/AutoTuner/AutoTuning.h | 486 ++++++++++++ + .../llvm/AutoTuner/AutoTuningRemarkManager.h | 43 ++ + .../llvm/AutoTuner/AutoTuningRemarkStreamer.h | 47 ++ + llvm/include/llvm/CodeGen/MachineBasicBlock.h | 13 + + llvm/include/llvm/IR/Function.h | 37 + + llvm/include/llvm/IR/InstrTypes.h | 24 + + llvm/include/llvm/IR/Instructions.h | 24 + + llvm/include/llvm/IR/Module.h | 3 + + llvm/include/llvm/IR/StructuralHash.h | 14 + + llvm/include/llvm/InitializePasses.h | 5 + + llvm/include/llvm/LinkAllPasses.h | 8 + + llvm/include/llvm/Remarks/Remark.h | 32 + + llvm/include/llvm/Support/CommandLine.h | 17 + + llvm/include/llvm/Transforms/Scalar.h | 17 + + .../Transforms/Scalar/AutoTuningCompile.h | 170 +++++ + .../llvm/Transforms/Utils/UnrollLoop.h | 4 + + llvm/lib/Analysis/AutotuningDump.cpp | 265 +++++++ + llvm/lib/Analysis/CMakeLists.txt | 2 + + llvm/lib/Analysis/InlineAdvisor.cpp | 18 + + llvm/lib/Analysis/InlineCost.cpp | 29 + + llvm/lib/Analysis/LoopInfo.cpp | 52 ++ + llvm/lib/AutoTuner/AutoTuning.cpp | 705 ++++++++++++++++++ + .../lib/AutoTuner/AutoTuningRemarkManager.cpp | 299 ++++++++ + .../AutoTuner/AutoTuningRemarkStreamer.cpp | 55 ++ + llvm/lib/AutoTuner/CMakeLists.txt | 11 + + llvm/lib/CMakeLists.txt | 1 + + llvm/lib/CodeGen/CMakeLists.txt | 1 + + llvm/lib/CodeGen/CalcSpillWeights.cpp | 30 + + llvm/lib/CodeGen/MachineBasicBlock.cpp | 36 + + llvm/lib/CodeGen/MachineScheduler.cpp | 44 ++ + llvm/lib/CodeGen/SwitchLoweringUtils.cpp | 19 + + llvm/lib/IR/AsmWriter.cpp | 151 ++++ + llvm/lib/IR/CMakeLists.txt | 1 + + llvm/lib/IR/Function.cpp | 34 + + llvm/lib/IR/Instructions.cpp | 86 +++ + llvm/lib/IR/StructuralHash.cpp | 114 +++ + llvm/lib/Passes/PassBuilder.cpp | 5 + + 
llvm/lib/Passes/PassBuilderPipelines.cpp | 46 ++ + llvm/lib/Passes/PassRegistry.def | 13 + + llvm/lib/Passes/StandardInstrumentations.cpp | 23 + + .../lib/Remarks/BitstreamRemarkSerializer.cpp | 8 + + llvm/lib/Remarks/RemarkStreamer.cpp | 4 + + llvm/lib/Remarks/YAMLRemarkParser.cpp | 122 +++ + llvm/lib/Remarks/YAMLRemarkParser.h | 6 + + llvm/lib/Remarks/YAMLRemarkSerializer.cpp | 84 +++ + llvm/lib/Support/CommandLine.cpp | 41 + + llvm/lib/Transforms/IPO/CMakeLists.txt | 1 + + llvm/lib/Transforms/IPO/Inliner.cpp | 36 + + llvm/lib/Transforms/IPO/SampleProfile.cpp | 14 + + .../Transforms/Instrumentation/CMakeLists.txt | 1 + + .../Instrumentation/PGOInstrumentation.cpp | 8 + + .../Transforms/Scalar/AutoTuningCompile.cpp | 334 +++++++++ + llvm/lib/Transforms/Scalar/CMakeLists.txt | 2 + + llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 187 +++++ + llvm/lib/Transforms/Scalar/Scalar.cpp | 4 + + llvm/lib/Transforms/Scalar/Sink.cpp | 5 + + llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + + llvm/lib/Transforms/Utils/LCSSA.cpp | 5 + + llvm/lib/Transforms/Utils/LoopSimplify.cpp | 8 + + llvm/lib/Transforms/Utils/LoopUnroll.cpp | 3 + + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + + .../Vectorize/LoopVectorizationLegality.cpp | 12 + + .../Transforms/Vectorize/LoopVectorize.cpp | 34 + + .../Inputs/unroll_template.yaml | 8 + + .../AutotuningDump/create-data-dir.ll | 65 ++ + llvm/test/AutoTuning/AutotuningDump/unroll.ll | 35 + + .../autotune_datadir/baseline_config.yaml | 9 + + .../autotune_datadir/random_config.yaml | 9 + + .../AutoTuning/BaselineConfig/Inputs/test.ll | 117 +++ + .../BaselineConfig/apply_baseline_config.ll | 11 + + llvm/test/AutoTuning/BaselineConfig/opp.ll | 67 ++ + .../CodeRegionFilter/function-filtering.ll | 62 ++ + .../Error/Inputs/invalid-format.yaml | 3 + + .../AutoTuning/Error/Inputs/template.yaml | 10 + + .../AutoTuning/Error/file-not-found-error.ll | 29 + + .../AutoTuning/Error/invalid-yaml-error.ll | 27 + + .../AutoTuning/Error/malformed-input-error.ll 
| 136 ++++ + llvm/test/AutoTuning/Error/output-error.ll | 28 + + llvm/test/AutoTuning/Error/valid-input.ll | 27 + + .../Inputs/template.yaml | 9 + + .../inc-compile-parse-input.ll | 103 +++ + .../AutoTuning/Inline/Inputs/template.yaml | 9 + + .../Inline/Inputs/template_no_metadata.yaml | 7 + + .../test/AutoTuning/Inline/duplicate-calls.ll | 96 +++ + llvm/test/AutoTuning/Inline/force-inline.ll | 84 +++ + .../AutoTuning/Inline/inline-attribute.ll | 85 +++ + llvm/test/AutoTuning/Inline/opp.ll | 64 ++ + .../LoopUnroll/Inputs/debug_loc_template.yaml | 10 + + .../LoopUnroll/Inputs/loop_nest.yaml | 10 + + .../LoopUnroll/Inputs/loop_peel.yaml | 9 + + .../Inputs/unroll_raw_template.yaml | 10 + + .../LoopUnroll/Inputs/unroll_template.yaml | 10 + + .../Inputs/unroll_template_no_metadata.yaml | 8 + + llvm/test/AutoTuning/LoopUnroll/debug_loc.ll | 161 ++++ + .../AutoTuning/LoopUnroll/dynamic_config.ll | 56 ++ + llvm/test/AutoTuning/LoopUnroll/loop_nest.ll | 136 ++++ + llvm/test/AutoTuning/LoopUnroll/loop_peel.ll | 53 ++ + .../AutoTuning/LoopUnroll/unroll-pragma.ll | 129 ++++ + llvm/test/AutoTuning/LoopUnroll/unroll.ll | 101 +++ + llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll | 113 +++ + .../Inputs/vectorize_template.yaml | 9 + + .../vectorize_template_no_metadata.yaml | 7 + + .../LoopVectorize/force-vector-interleave.ll | 88 +++ + .../Inputs/misched_x86_template.yaml | 10 + + .../misched_x86_bidirectional.ll | 73 ++ + .../MachineScheduler/misched_x86_bottomup.ll | 72 ++ + .../MachineScheduler/misched_x86_topdown.ll | 72 ++ + .../AutoTuning/MetaData/structural_hash.ll | 234 ++++++ + .../AutoTuning/MetaData/write_no_metadata.ll | 191 +++++ + .../MetaData/write_with_metadata.ll | 204 +++++ + .../AutoTuning/PGO/Inputs/pgo-instr.proftext | 17 + + .../PGO/Inputs/pgo-sample-cold.prof | 7 + + .../AutoTuning/PGO/Inputs/pgo-sample-hot.prof | 7 + + llvm/test/AutoTuning/PGO/pgo-instr-filters.ll | 61 ++ + .../test/AutoTuning/PGO/pgo-sample-filters.ll | 138 ++++ + 
.../Inputs/pass_invocation.yaml | 10 + + .../PassInvocation/pass_invocation_read.ll | 64 ++ + .../PassInvocation/pass_invocation_write.ll | 67 ++ + .../PhaseOrdering/Inputs/template.yaml | 8 + + .../AutoTuning/PhaseOrdering/pass-order.ll | 65 ++ + .../AutoTuning/SwitchLowering/switch-opp.ll | 47 ++ + llvm/test/AutoTuning/lit.local.cfg | 2 + + llvm/test/AutoTuning/opt-opp.ll | 315 ++++++++ + llvm/test/lit.site.cfg.py.in | 1 + + llvm/tools/llc/llc.cpp | 19 + + llvm/tools/opt/NewPMDriver.cpp | 42 ++ + llvm/tools/opt/opt.cpp | 53 ++ + 132 files changed, 7801 insertions(+) + create mode 100644 llvm/include/llvm/Analysis/AutotuningDump.h + create mode 100644 llvm/include/llvm/AutoTuner/AutoTuning.h + create mode 100644 llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h + create mode 100644 llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h + create mode 100644 llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h + create mode 100644 llvm/lib/Analysis/AutotuningDump.cpp + create mode 100644 llvm/lib/AutoTuner/AutoTuning.cpp + create mode 100644 llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp + create mode 100644 llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp + create mode 100644 llvm/lib/AutoTuner/CMakeLists.txt + create mode 100644 llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp + create mode 100644 llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml + create mode 100644 llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll + create mode 100644 llvm/test/AutoTuning/AutotuningDump/unroll.ll + create mode 100644 llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml + create mode 100644 llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml + create mode 100644 llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll + create mode 100644 llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll + create mode 100644 llvm/test/AutoTuning/BaselineConfig/opp.ll + create mode 100644 
llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll + create mode 100644 llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml + create mode 100644 llvm/test/AutoTuning/Error/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/Error/file-not-found-error.ll + create mode 100644 llvm/test/AutoTuning/Error/invalid-yaml-error.ll + create mode 100644 llvm/test/AutoTuning/Error/malformed-input-error.ll + create mode 100644 llvm/test/AutoTuning/Error/output-error.ll + create mode 100644 llvm/test/AutoTuning/Error/valid-input.ll + create mode 100644 llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll + create mode 100644 llvm/test/AutoTuning/Inline/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml + create mode 100644 llvm/test/AutoTuning/Inline/duplicate-calls.ll + create mode 100644 llvm/test/AutoTuning/Inline/force-inline.ll + create mode 100644 llvm/test/AutoTuning/Inline/inline-attribute.ll + create mode 100644 llvm/test/AutoTuning/Inline/opp.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml + create mode 100644 llvm/test/AutoTuning/LoopUnroll/debug_loc.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/loop_nest.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/loop_peel.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll + create mode 100644 
llvm/test/AutoTuning/LoopUnroll/unroll.ll + create mode 100644 llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll + create mode 100644 llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template.yaml + create mode 100644 llvm/test/AutoTuning/LoopVectorize/Inputs/vectorize_template_no_metadata.yaml + create mode 100644 llvm/test/AutoTuning/LoopVectorize/force-vector-interleave.ll + create mode 100644 llvm/test/AutoTuning/MachineScheduler/Inputs/misched_x86_template.yaml + create mode 100644 llvm/test/AutoTuning/MachineScheduler/misched_x86_bidirectional.ll + create mode 100644 llvm/test/AutoTuning/MachineScheduler/misched_x86_bottomup.ll + create mode 100644 llvm/test/AutoTuning/MachineScheduler/misched_x86_topdown.ll + create mode 100644 llvm/test/AutoTuning/MetaData/structural_hash.ll + create mode 100644 llvm/test/AutoTuning/MetaData/write_no_metadata.ll + create mode 100644 llvm/test/AutoTuning/MetaData/write_with_metadata.ll + create mode 100644 llvm/test/AutoTuning/PGO/Inputs/pgo-instr.proftext + create mode 100644 llvm/test/AutoTuning/PGO/Inputs/pgo-sample-cold.prof + create mode 100644 llvm/test/AutoTuning/PGO/Inputs/pgo-sample-hot.prof + create mode 100644 llvm/test/AutoTuning/PGO/pgo-instr-filters.ll + create mode 100644 llvm/test/AutoTuning/PGO/pgo-sample-filters.ll + create mode 100644 llvm/test/AutoTuning/PassInvocation/Inputs/pass_invocation.yaml + create mode 100644 llvm/test/AutoTuning/PassInvocation/pass_invocation_read.ll + create mode 100644 llvm/test/AutoTuning/PassInvocation/pass_invocation_write.ll + create mode 100644 llvm/test/AutoTuning/PhaseOrdering/Inputs/template.yaml + create mode 100644 llvm/test/AutoTuning/PhaseOrdering/pass-order.ll + create mode 100644 llvm/test/AutoTuning/SwitchLowering/switch-opp.ll + create mode 100644 llvm/test/AutoTuning/lit.local.cfg + create mode 100644 llvm/test/AutoTuning/opt-opp.ll + +diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake +index 6af47b51d4c6..1a9fb4b2dddc 
100644 +--- a/llvm/cmake/modules/CrossCompile.cmake ++++ b/llvm/cmake/modules/CrossCompile.cmake +@@ -82,6 +82,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) + -DLLVM_ENABLE_PROJECTS="${llvm_enable_projects_arg}" + -DLLVM_EXTERNAL_PROJECTS="${llvm_external_projects_arg}" + -DLLVM_ENABLE_RUNTIMES="${llvm_enable_runtimes_arg}" ++ -DLLVM_ENABLE_AUTOTUNER="${LLVM_ENABLE_AUTOTUNER}" + ${external_project_source_dirs} + -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN="${LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN}" + -DLLVM_INCLUDE_BENCHMARKS=OFF +diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake +index 62a1a64d37d4..b8e9dbe29d88 100644 +--- a/llvm/cmake/modules/HandleLLVMOptions.cmake ++++ b/llvm/cmake/modules/HandleLLVMOptions.cmake +@@ -112,6 +112,14 @@ else() + set(BUILD_FOR_OPENEULER 0) + endif() + ++option(LLVM_ENABLE_AUTOTUNER "Enable BiSheng Auto-Tuning features" OFF) ++if (LLVM_ENABLE_AUTOTUNER) ++ set(LLVM_ENABLE_AUTOTUNER 1) ++ add_definitions( -DENABLE_AUTOTUNER ) ++else() ++ set(LLVM_ENABLE_AUTOTUNER 0) ++endif() ++ + if(LLVM_ENABLE_EXPENSIVE_CHECKS) + add_compile_definitions(EXPENSIVE_CHECKS) + +diff --git a/llvm/include/llvm/Analysis/AutotuningDump.h b/llvm/include/llvm/Analysis/AutotuningDump.h +new file mode 100644 +index 000000000000..fb973f05323e +--- /dev/null ++++ b/llvm/include/llvm/Analysis/AutotuningDump.h +@@ -0,0 +1,75 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===-- AutotuningDump.h - Auto-Tuning-----------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. 
++// ++// ===--------------------------------------------------------------------===// ++// ++// This file contains pass collecting IR of tuned regions and storing them into ++// predetrmined locations, to be used later by autotuning ML guidance ++// ++// ===--------------------------------------------------------------------===// ++ ++#include "llvm/Analysis/LoopInfo.h" ++#include "llvm/Analysis/LoopPass.h" ++#include "llvm/IR/PassManager.h" ++#include "llvm/Transforms/Scalar/LoopPassManager.h" ++#include ++ ++namespace llvm { ++class AutotuningDump { ++public: ++ AutotuningDump(bool IncrementalCompilation = false); ++ bool run(Module &F, function_ref GetLI); ++ ++private: ++ std::string AutoTuneDirPath; ++ std::unique_ptr createFile(const Twine &File); ++ int getConfigNumber(); ++ void dumpToStream(llvm::raw_ostream &os, const Loop &L) const; ++ void dumpToStream(llvm::raw_ostream &os, const Function &F) const; ++ void dumpFunctions(llvm::Module &M); ++ void dumpLoops(llvm::Module &M, function_ref GetLI); ++ void dumpModule(llvm::Module &M); ++ std::string getDirectoryName(const std::string File) const; ++ std::string getFileName(std::string FilePath); ++ ++ bool IsIncrementalCompilation; ++}; ++ ++class AutotuningDumpLegacy : public ModulePass { ++public: ++ static char ID; ++ AutotuningDumpLegacy(bool IncrementalCompilation = false); ++ StringRef getPassName() const override; ++ bool runOnModule(Module &M) override; ++ void getAnalysisUsage(AnalysisUsage &AU) const override; ++ ++private: ++ bool IsIncrementalCompilation; ++}; ++ ++class AutotuningDumpAnalysis ++ : public AnalysisInfoMixin { ++ friend AnalysisInfoMixin; ++ static AnalysisKey Key; ++ ++public: ++ AutotuningDumpAnalysis(bool IncrementalCompilation = false) { ++ IsIncrementalCompilation = IncrementalCompilation; ++ } ++ ++ // This pass only prints IRs of selected function or loops without doing any ++ // real analyses, thus the return value is meaningless. 
To avoid leaking data ++ // or memory, we typedef Result to Optional to avoid having to return an ++ // AutotuningDump object. ++ using Result = std::optional; ++ Result run(Module &M, ModuleAnalysisManager &AM); ++ ++private: ++ bool IsIncrementalCompilation; ++}; ++} // namespace llvm ++#endif +\ No newline at end of file +diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h +index 3434630c27cf..9be3e056cf76 100644 +--- a/llvm/include/llvm/Analysis/LoopInfo.h ++++ b/llvm/include/llvm/Analysis/LoopInfo.h +@@ -26,6 +26,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +@@ -44,7 +47,12 @@ extern template class LoopBase; + + /// Represents a single loop in the control flow graph. Note that not all SCCs + /// in the CFG are necessarily loops. ++#if defined(ENABLE_AUTOTUNER) ++class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase, ++ public autotuning::Container { ++#else + class LLVM_EXTERNAL_VISIBILITY Loop : public LoopBase { ++#endif + public: + /// A range representing the start and end location of a loop. + class LocRange { +@@ -395,6 +403,11 @@ public: + return ""; + } + ++#if defined(ENABLE_AUTOTUNER) ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++#endif ++ + private: + Loop() = default; + +diff --git a/llvm/include/llvm/Analysis/Passes.h b/llvm/include/llvm/Analysis/Passes.h +index ac1bc3549910..65f566cc75de 100644 +--- a/llvm/include/llvm/Analysis/Passes.h ++++ b/llvm/include/llvm/Analysis/Passes.h +@@ -58,6 +58,16 @@ namespace llvm { + // in a function and builds the region hierarchy. + // + FunctionPass *createRegionInfoPass(); ++ ++#if defined(ENABLE_AUTOTUNER) ++ //===--------------------------------------------------------------------===// ++ // ++ // createAutotuningDumpPass - This pass collects IR of tuned regions ++ // and stores them into predetrmined locations. 
++ // for the purpose of autotuning ML guidance ++ // ++ ModulePass *createAutotuningDumpPass(); ++#endif + } + + #endif +diff --git a/llvm/include/llvm/AutoTuner/AutoTuning.h b/llvm/include/llvm/AutoTuner/AutoTuning.h +new file mode 100644 +index 000000000000..0f1f276306ec +--- /dev/null ++++ b/llvm/include/llvm/AutoTuner/AutoTuning.h +@@ -0,0 +1,486 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===-- AutoTuning.h - Auto-Tuning-----------------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines Auto Tuning related functions, models and interfaces. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNER_AUTOTUNING_H_ ++#define LLVM_AUTOTUNER_AUTOTUNING_H_ ++ ++#include "llvm/ADT/DenseMapInfo.h" ++#include "llvm/ADT/Hashing.h" ++#include "llvm/ADT/SetVector.h" ++#include "llvm/ADT/SmallVector.h" ++#include "llvm/IR/DebugInfoMetadata.h" ++#include "llvm/IR/DebugLoc.h" ++#include "llvm/Support/Casting.h" ++#include ++#include ++#include ++#include ++#include ++ ++// Options for AutoTuner incremental compilation. ++enum AutoTuningCompileOpt { ++ Inactive, // Disabled incremental compilation. ++ CoarseGrain, // For tuning LLVMParam. ++ FineGrain, // For tuning default code regions (Loop, CallSite, Function). ++ Basic // Same as CoarseGrain but can be applied for any code region. ++ // Can be used with ImpactRanker. ++}; ++ ++namespace autotuning { ++// Constant defintion for AutoTuner incremental compilation. 
++const std::string CompileOptionStart = "start"; ++const std::string CompileOptionEnd = "end"; ++const std::string CompileOptionUnknow = "unknown"; ++const std::string CompileOptionUnroll = "loop-unroll"; ++const std::string CompileOptionVectorize = "loop-vectorize"; ++const std::string CompileOptionInline = "inline"; ++ ++class ParameterBase { ++public: ++ virtual ~ParameterBase() = default; ++ enum ParameterKind { ++ PK_PARAMETER, ++ }; ++ ParameterKind getKind() const { return Kind; } ++ ++ explicit ParameterBase(ParameterKind K) : Kind(K) {} ++ ++private: ++ const ParameterKind Kind; ++}; ++ ++template class Parameter : public ParameterBase { ++public: ++ Parameter(const T &RHS) : ParameterBase(PK_PARAMETER), Value(RHS) {} ++ const T &getValue() const { return Value; } ++ void setValue(const T &RHS) { Value = RHS; } ++ ++ static bool classof(const ParameterBase *P) { ++ return P->getKind() == PK_PARAMETER; ++ } ++ ++private: ++ T Value; ++}; ++ ++/// This class manages parameters of one codeRegion. ++class ParameterManager { ++ ++public: ++ // add a param into this ParameterManager ++ template ++ void add(const std::string &ParamName, const T ParamValue) { ++ std::shared_ptr Param = ++ std::make_shared>(ParamValue); ++ this->Parameters[ParamName] = Param; ++ } ++ ++ // Look up the value of a parameter by name in this ParameterManager. ++ // The found value will be assigned to the reference variable "Value". ++ // Return true if the parameter exits in this ParameterManager, ++ // and false otherwise. 
++ template ++ bool findByName(const std::string &ParamName, T &Value) const { ++ auto Iterator = Parameters.find(ParamName); ++ if (Iterator == Parameters.end()) { ++ return false; ++ } ++ ++ auto ParamPtr = llvm::dyn_cast>(Iterator->second.get()); ++ if (ParamPtr != nullptr) { ++ Value = ParamPtr->getValue(); ++ return true; ++ } else { ++ return false; ++ } ++ } ++ ++private: ++ std::unordered_map> Parameters; ++}; ++ ++/// The debug location used to track a CodeRegion back to the source file. ++struct SourceLocation { ++ /// The source file corresponding to this CodeRegion. ++ std::string SourceFilePath; ++ unsigned SourceLine = 0; ++ unsigned SourceColumn = 0; ++ ++ bool operator==(const SourceLocation &CR) const { ++ return (this->SourceFilePath == CR.SourceFilePath) && ++ (this->SourceLine == CR.SourceLine) && ++ (this->SourceColumn == CR.SourceColumn); ++ }; ++ ++ explicit operator bool() const { ++ return !(SourceFilePath.empty() && SourceLine == 0 && SourceColumn == 0); ++ } ++}; ++ ++enum CodeRegionType { ++ CallSite, // Code region for function inlining. ++ Function, // Used in AutoTuningDump pass for IR writing. ++ LLVMParam, // Compilation flags. Tuned individually for each module. ++ Loop, // Code region for loops. ++ MachineBasicBlock, // Instruction scheduling code region. ++ Other, // Pass ordering code region. ++ ProgramParam, // Compilation flags. Tuned collectively for program. ++ Switch, // Tuning MinJumpTableEntries parameter for switch inst. ++ Empty, // Empty CodeRegion. ++ Invalid // Invalid CodeRegion. ++}; ++ ++enum HotnessType { ++ Unknown, ++ Cold, ++ Hot, ++}; ++ ++/// DynamicOptions represent a map: Arg -> DynamicConfigs. ++/// Where Arg is a tuning parameter on the associated CodeRegion. ++/// And DynamicConfigs is the possible tuning values associated with Arg. 
++typedef std::map> DynamicOptions; ++ ++/// This class represents a region in source code including ++/// its name, function name, type, debug location, and associated pass name. ++class CodeRegion { ++ ++public: ++ // Default constructor ++ CodeRegion(const CodeRegionType Type = CodeRegionType::Other); ++ ~CodeRegion() = default; ++ // Concrete constructors ++ CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, const llvm::DebugLoc &DL, ++ const DynamicOptions DO = {}); ++ CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, ++ const SourceLocation &Location = SourceLocation(), ++ const DynamicOptions DO = {}); ++ CodeRegion(const std::string &Name, const std::string &FuncName, ++ const std::string &PassName, const CodeRegionType &Type, ++ const SourceLocation &Location = SourceLocation(), ++ const unsigned int Invocation = 0); ++ ++ bool operator==(const CodeRegion &CR) const; ++ inline bool operator!=(const CodeRegion &CR) const { return !(*this == CR); }; ++ ++ explicit operator bool() const { ++ return !(Name.empty() && FuncName.empty() && PassName.empty()); ++ } ++ ++ static std::string getTypeAsString(CodeRegionType CRType); ++ static std::string getHotnessAsString(HotnessType Hotness); ++ const std::string &getName() const { return Name; } ++ const std::string &getFuncName() const { return FuncName; } ++ const CodeRegionType &getType() const { return Type; } ++ const std::string &getFileName() const { return Location.SourceFilePath; } ++ const std::string &getTypeAsString() const { return StringType; } ++ const SourceLocation &getSourceLoc() const { return Location; } ++ const std::string &getPassName() const { return PassName; } ++ unsigned getSize() const { return Size; }; ++ void setPassName(const std::string &NewPassName); ++ void setSize(unsigned Size) { this->Size = Size; }; ++ void setHotness(HotnessType NewHotness) const { this->Hotness = NewHotness; } ++ 
HotnessType getHotness() const { return this->Hotness; } ++ std::string getHotnessAsString() const { return getHotnessAsString(Hotness); } ++ bool isCold() const { return this->Hotness == Cold; } ++ bool isHot() const { return this->Hotness == Hot; } ++ std::uint64_t getHash() const { return this->Hash; } ++ void setHash(std::uint64_t Hash) { this->Hash = Hash; } ++ DynamicOptions getAutoTunerOptions() const { return this->AutoTunerOptions; } ++ void setInvocation(unsigned int Invocation) { this->Invocation = Invocation; } ++ unsigned int getInvocation() const { return this->Invocation; } ++ ++ /// Add dynamic config options with Code Region for AutoTuner to tune instead ++ /// of using static config options. ++ void addAutoTunerOptions(const std::string ParamName, ++ std::vector Options) const { ++ this->AutoTunerOptions.insert( ++ std::pair>(ParamName, Options)); ++ } ++ static CodeRegion getInvalidInstance(); ++ static CodeRegion getEmptyInstance(); ++ void setBaselineConfig(std::map Value) const { ++ this->BaselineConfig = Value; ++ }; ++ std::map getBaselineConfig() const { ++ return this->BaselineConfig; ++ } ++ ++private: ++ /// Name of the code region. ++ /// For most of cases it's set to the name of a header basic block. ++ std::string Name; ++ /// Function name of this code region if any. ++ std::string FuncName; ++ /// Name of the pass which this code region is associated. ++ std::string PassName; ++ /// Type of this code region. Options are other, function, loop, ++ /// and machine basic block. ++ CodeRegionType Type; ++ /// Source Location. ++ SourceLocation Location; ++ std::string StringType; ++ /// Structural hash for the CodeRegion. ++ std::uint64_t Hash = 0; ++ /// Configs values passed to AutoTuner for dynamic setting of search space ++ /// for code regions. ++ mutable DynamicOptions AutoTunerOptions; ++ /// Configuration values passed to AutoTuner for generating the same binary ++ /// as the baseline. 
++ mutable std::map BaselineConfig; ++ ++ /// Record the order of invocation of an optimization pass during the whole ++ /// compilation pipeline. It is used to differentiate multiple invocations of ++ /// a same optimization pass. ++ /// Currently, Loop Unroll pass is invoked twice during the compilation ++ /// pipeline. 'Invocation' helps to relate a code region with the invocation ++ /// of Loop Unroll pass where the code region is generated. ++ mutable unsigned int Invocation; ++ ++ /// Size of this code region. Usually it refers to the number of instructions ++ /// but could be different based on implementations. ++ unsigned Size = 0; ++ mutable HotnessType Hotness = Unknown; ++ ++ /// A boolean flag to record if a CR is initialized or not. ++ /// It should only be set to true by initContainer(). ++ /// We only add initialized CR to TuningOpps. ++ bool Initialized = false; ++ ++ friend class AutoTuningEngine; ++}; ++ ++/// This class is an interface for classes representing code regions in LLVM ++/// (eg. Loop, Function and MachineBasicBlock) to inherit ++/// so that auto-tuning can be enabled on them. ++/// A Container must contain a CodeRegion. ++class Container { ++ ++public: ++ Container() {} ++ virtual ~Container(){}; ++ ++ /// Abstract method for derived classes to overwrite ++ virtual void initCodeRegion() = 0; ++ virtual uint64_t computeStructuralHash() = 0; ++ ++ /// Get the Container's CodeRegion. ++ const CodeRegion &getCodeRegion() const; ++ /// Set the Container's CodeRegion. ++ void setCodeRegion(const CodeRegion &NewCR); ++ /// This method is to look up the value of a parameter that corresponds to an ++ /// Container. The parameter being looked up is stored in a ParameterManager. ++ template ++ bool lookUpParams(const std::string &ParamsName, T &Value) const; ++ ++ /// Check if the code region is being tuned by config file. 
++ bool requiresIRDump(bool IsFunctionIR = false) const; ++ ++private: ++ CodeRegion CR; ++ friend class AutoTuningEngine; ++}; ++} // end namespace autotuning ++ ++namespace std { ++template <> ++// Implement hash for CodeRegion data type in std namespace. Only using common ++// attributes (with and without using 'OmitAutotuningMetadata' flag) of ++// CodeRegion. Remaining attributes are compared in overloaded == function. ++struct hash { ++ std::size_t operator()(const autotuning::CodeRegion &CR) const { ++ return llvm::hash_combine(CR.getPassName(), CR.getType()); ++ } ++}; ++} // namespace std ++ ++namespace llvm { ++// Forward Decleration. ++class CallBase; ++ ++typedef autotuning::CodeRegion CodeRegion; ++template <> struct DenseMapInfo { ++ static bool isEqual(const CodeRegion &LHS, const CodeRegion &RHS) { ++ return LHS == RHS; ++ } ++ static inline CodeRegion getEmptyKey() { ++ return autotuning::CodeRegion::getEmptyInstance(); ++ } ++ static inline CodeRegion getTombstoneKey() { ++ return autotuning::CodeRegion::getInvalidInstance(); ++ } ++ // Implement hash for CodeRegion data type in llvm namespace. Only using ++ // common attributes (with and without using 'OmitAutotuningMetadata' flag) ++ // of CodeRegion. Remaining attributes are compared in overloaded == ++ // function. ++ static unsigned getHashValue(const CodeRegion &CR) { ++ return llvm::hash_combine(CR.getPassName(), CR.getType()); ++ } ++}; ++} // namespace llvm ++ ++namespace autotuning { ++using namespace llvm; ++typedef std::unordered_map LookUpTable; ++typedef llvm::SetVector CodeRegions; ++ ++/// Structure to store information of CallSite code regions which is used to ++/// get a different SourceLocation for multiple callsites (same callee) in a ++/// function when these callsites have same SourceLocation due to inlining. 
++struct CallSiteLocation { ++ llvm::CallBase *CB; ++ llvm::Function *Caller; ++ llvm::Function *Callee; ++ SourceLocation SrcLoc; ++}; ++ ++class AutoTuningEngine { ++public: ++ AutoTuningEngine() { Enabled = false; } ++ ~AutoTuningEngine() {} ++ ++ /// Initialize the Container for auto-tuning. ++ void initContainer(Container *Container, const std::string &PassName, ++ const StringRef FuncName = "", bool AddOpportunity = true, ++ unsigned int Invocation = 0); ++ ++ /// Initialize auto-tuning. This method should only be called in the main ++ /// function. ++ /// \return Error::success() on success or the related Error otherwise. ++ llvm::Error init(const std::string &ModuleID); ++ ++ /// Finalize auto-tuning. This method should only be called in the main ++ /// function. ++ /// \return Error::success() on success or the related Error otherwise. ++ llvm::Error finalize(); ++ ++ /// Return the number of tuning configuration used for this compilation. ++ llvm::Expected getConfigNumber(); ++ ++ void enable() { Enabled = true; } ++ void disable() { Enabled = false; } ++ bool isEnabled() const { return Enabled; } ++ bool isMLEnabled() const { return MLEnabled; } ++ bool isDumpEnabled() const { return DumpEnabled; } ++ bool isGenerateOutput() const { return GenerateOutput; } ++ bool isParseInput() const { return ParseInput; } ++ bool isTuningAllowedForType(CodeRegionType CRType) const { ++ return (CodeRegionFilterTypes.count(CRType) > 0); ++ } ++ bool isThinLTOTuning() const; ++ ++ /// Convert a pass-name to CodeRegionType. ++ CodeRegionType convertPassToType(std::string Pass); ++ ++ /// First sets BaselineConfig value for the CR then ++ /// add a tuning opportunity into the TuningOpps list. 
++  void addOpportunity(const CodeRegion &OppCR,
++                      std::map<std::string, std::string> BaselineConfig = {});
++  bool hasOpportunities() const { return !TuningOpps.empty(); }
++
++  bool shouldRunOptPass(std::string FileName, std::string Pass);
++
++  /// Insert all of the callsites of a function in CallSiteLocs vector.
++  void insertCallSiteLoc(CallSiteLocation Loc);
++
++  /// Update CallSiteLocs vector with new callsites (if any) which get available
++  /// due to inlining.
++  void updateCallSiteLocs(llvm::CallBase *CB, llvm::CallBase *Ptr,
++                          llvm::Function *F, unsigned int Line);
++
++  /// Clean up the CallSiteLocs vector by keeping the callsite if there are
++  /// multiple calls to same callee. This cleaning will be performed before
++  /// inlining any callsite.
++  void cleanCallSiteLoc();
++
++  /// clear the CallSiteLocs vector.
++  void clearCallSiteLocs();
++
++  /// Return the SourceLocation::SourceLine (if available).
++  std::optional<unsigned> getCallSiteLoc(llvm::CallBase *CB);
++
++  template <typename T>
++  bool lookUpGlobalParams(const std::string &ParamsName, T &Value) const;
++  /// A map storing llvm parameters.
++  std::unordered_map<std::string, std::string> LLVMParams;
++  /// A map storing program parameters.
++  std::unordered_map<std::string, std::string> ProgramParams;
++
++private:
++  std::string ModuleID;
++  /// This boolean indicates if the auto-tuning mode is enabled.
++  /// It will be set to true if any of the following command line options
++  /// (auto-tuning-input, auto-tuning-result and auto-tuning-opp) is specified.
++  bool Enabled;
++  /// This boolean indicates if the ML guidance feature is enabled in
++  /// Autotuner. It will be set to true if -fautotune-rank is specified.
++  bool MLEnabled;
++  /// This boolean indicates if the IR dumping is enabled or not. IR dumping
++  /// is enabled for ML guidance feature. It can also be enabled with command
++  /// line compiler flag 'enable-autotuning-dump'.
++ bool DumpEnabled = false; ++ /// This boolean indicates if compiler is parsing/using 'config.yaml' file ++ /// generated by AutoTuner and use the configuration values instead of ++ /// determining with compiler heuristic. ++ bool ParseInput; ++ /// This boolean indicates if compiler is creating/generating opportunity ++ /// file(s) which will be consumed by AutoTuner to create the search space. ++ bool GenerateOutput; ++ /// A map of filename and set of optimization passes; an optimization pass ++ /// will be added to this set if a CodeRegion belongs to the optimization ++ /// pass. ++ std::unordered_map> OppPassList; ++ ++ /// Vector to store all of the duplicate calls in a function and the calls ++ /// which get available due to inlining. ++ SmallVector CallSiteLocs; ++ ++ /// A set to store the code region types that will be tuned in current ++ /// autotuning flow. This will be populated with code region types based on ++ /// 'auto-tuning-type-filter' for -fautotune-generate and the types will be ++ /// extracted from config.yaml in case of -fautotune. ++ /// This set is used to apply type-based filtering prior to creating/ ++ /// initializing a code region. ++ std::unordered_set CodeRegionFilterTypes; ++ ++ // A statically initialized map used to convert 'pass-name' to ++ // 'CodeRegionType'. ++ std::unordered_map PTTMap; ++ ++ /// A map of CodeRegion and ParameterManager to keep track of all the ++ /// parameters of code regions loaded from input config file. ++ LookUpTable ParamTable; ++ /// A list of CodeRegions as tuning opportunities ++ CodeRegions TuningOpps; ++ /// A ParameterManager for global parameters. ++ ParameterManager GlobalParams; ++ ++ /// Apply filters for CodeRegions. ++ void applyOppFilters(CodeRegions &CRs); ++ ++ /// Apply function name filter for CodeRegions. 
++ bool applyFunctionFilter(std::string FuncName); ++ ++ friend class Container; ++ friend class CodeRegion; ++ friend class AutoTuningRemarkManager; ++}; ++ ++extern class AutoTuningEngine Engine; // AutoTuning Engine ++ ++} // end namespace autotuning ++ ++#endif /* LLVM_AUTOTUNER_AUTOTUNING_H_ */ ++#endif +diff --git a/llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h b/llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h +new file mode 100644 +index 000000000000..153a2c6246ad +--- /dev/null ++++ b/llvm/include/llvm/AutoTuner/AutoTuningRemarkManager.h +@@ -0,0 +1,43 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===- llvm/AutoTuner/AutoTuningRemarkManager.h - Remark Manager ----------===// ++// ++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. ++// See https://llvm.org/LICENSE.txt for license information. ++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++// ++// This file declares the main interface for inputting and outputting ++// remarks for AutoTuning. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNINGREMARKMANAGER_H ++#define LLVM_AUTOTUNINGREMARKMANAGER_H ++ ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Remarks/RemarkStreamer.h" ++#include "llvm/Support/Error.h" ++#include ++#include ++#include ++ ++namespace autotuning { ++class AutoTuningRemarkManager { ++public: ++ /// Read a list of parameters from input file. ++ /// Return true on success and false on failure. ++ static llvm::Error read(autotuning::AutoTuningEngine &E, ++ const std::string &InputName, ++ const std::string &RemarksFormat); ++ ++ /// Dump a list of CodeRegions as tuning opportunities into a file. ++ /// Return true on success and false on failure. 
++ static llvm::Error dump(const autotuning::AutoTuningEngine &E, ++ const std::string &DirPath, ++ const std::string &RemarksFormat, ++ const std::string &RemarksPasses); ++}; ++} // namespace autotuning ++#endif // LLVM_AUTOTUNINGREMARKMANAGER_H ++#endif +diff --git a/llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h b/llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h +new file mode 100644 +index 000000000000..0096139b12e9 +--- /dev/null ++++ b/llvm/include/llvm/AutoTuner/AutoTuningRemarkStreamer.h +@@ -0,0 +1,47 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===------------ llvm/AutoTuner/AutoTuningRemarkStreamer.h --------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved. ++// ++// ===---------------------------------------------------------------------===// ++// ++// This file contains the implementation of the conversion between AutoTuner ++// CodeRegions and serializable remarks::Remark objects. ++// ++// ===---------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNER_AUTOTUNINGREMARKSTREAMER_H ++#define LLVM_AUTOTUNER_AUTOTUNINGREMARKSTREAMER_H ++ ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Remarks/Remark.h" ++#include "llvm/Remarks/RemarkStreamer.h" ++#include "llvm/Support/Error.h" ++#include "llvm/Support/ToolOutputFile.h" ++#include ++#include ++ ++namespace llvm { ++/// Streamer for AutoTuner remarks which has logic for dealing with CodeRegions. ++class AutoTuningRemarkStreamer { ++ remarks::RemarkStreamer &RS; ++ /// Convert CodeRegion into remark objects. ++ remarks::Remark toRemark(const autotuning::CodeRegion &CR); ++ ++public: ++ AutoTuningRemarkStreamer(remarks::RemarkStreamer &RS) : RS(RS) {} ++ /// Emit a CodeRegion through the streamer. 
++ void emit(const autotuning::CodeRegion &CR); ++ /// Set a pass filter based on a regex \p Filter. ++ /// Returns an error if the regex is invalid. ++ Error setFilter(StringRef Filter); ++}; ++} // end namespace llvm ++ ++#endif // LLVM_AUTOTUNER_AUTOTUNINGREMARKSTREAMER_H ++#endif +diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h +index 52388692c196..95ac9acf4e5e 100644 +--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h ++++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h +@@ -27,6 +27,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +@@ -91,9 +94,19 @@ public: + void deleteNode(MachineInstr *MI); + }; + ++#if defined(ENABLE_AUTOTUNER) ++class MachineBasicBlock ++ : public ilist_node_with_parent, ++ public autotuning::Container { ++#else + class MachineBasicBlock + : public ilist_node_with_parent { ++#endif + public: ++#if defined(ENABLE_AUTOTUNER) ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++#endif + /// Pair of physical register and lane mask. + /// This is not simply a std::pair typedef because the members should be named + /// clearly as they both have an integer type. 
+diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h +index 93cf0d27e9a7..c0db48ae1789 100644 +--- a/llvm/include/llvm/IR/Function.h ++++ b/llvm/include/llvm/IR/Function.h +@@ -37,6 +37,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +@@ -56,6 +59,24 @@ class User; + class BranchProbabilityInfo; + class BlockFrequencyInfo; + ++#if defined(ENABLE_AUTOTUNER) ++class AutoTuningEnabledFunction : public autotuning::Container { ++public: ++ AutoTuningEnabledFunction() = delete; ++ void initCodeRegion() override; ++ void setHot() { this->Hotness = autotuning::Hot; } ++ void setCold() { this->Hotness = autotuning::Cold; } ++ autotuning::HotnessType getHotness() const { return this->Hotness; } ++ uint64_t computeStructuralHash() override; ++ ++private: ++ AutoTuningEnabledFunction(Function *F) { Func = F; }; ++ Function *Func; ++ autotuning::HotnessType Hotness = autotuning::Unknown; ++ friend class Function; ++}; ++#endif ++ + class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject, + public ilist_node { + public: +@@ -68,6 +89,13 @@ public: + using arg_iterator = Argument *; + using const_arg_iterator = const Argument *; + ++#if defined(ENABLE_AUTOTUNER) ++ // There is one-to-one correspondence between ATEFunction and the current ++ // Function object to avoid messing up the LLVM User and owned Use classes' ++ // memory layout. ++ AutoTuningEnabledFunction ATEFunction = AutoTuningEnabledFunction(this); ++#endif ++ + private: + // Important things that make up a function! + BasicBlockListType BasicBlocks; ///< The basic blocks +@@ -128,6 +156,11 @@ public: + void operator=(const Function&) = delete; + ~Function(); + ++#if defined(ENABLE_AUTOTUNER) ++ // Return the auto-tuning enabled version of this Function object. 
++ AutoTuningEnabledFunction &getATEFunction() { return ATEFunction; } ++#endif ++ + // This is here to help easily convert from FunctionT * (Function * or + // MachineFunction *) in BlockFrequencyInfoImpl to Function * by calling + // FunctionT->getFunction(). +@@ -840,7 +873,11 @@ public: + /// AssemblyAnnotationWriter. + void print(raw_ostream &OS, AssemblyAnnotationWriter *AAW = nullptr, + bool ShouldPreserveUseListOrder = false, ++#if defined(ENABLE_AUTOTUNER) ++ bool IsForDebug = false, bool PrintCompleteIR = false) const; ++#else + bool IsForDebug = false) const; ++#endif + + /// viewCFG - This function is meant for use from the debugger. You can just + /// say 'call F->viewCFG()' and a ghostview window should pop up from the +diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h +index 6095b0a1be69..dcc9bbee30fa 100644 +--- a/llvm/include/llvm/IR/InstrTypes.h ++++ b/llvm/include/llvm/IR/InstrTypes.h +@@ -1169,6 +1169,23 @@ public: + using OperandBundleDef = OperandBundleDefT; + using ConstOperandBundleDef = OperandBundleDefT; + ++#if defined(ENABLE_AUTOTUNER) ++//===----------------------------------------------------------------------===// ++// AutoTuningEnabledCallSite Class ++//===----------------------------------------------------------------------===// ++class CallBase; ++class AutoTuningEnabledCallSite : public autotuning::Container { ++public: ++ AutoTuningEnabledCallSite() = delete; ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++ AutoTuningEnabledCallSite(CallBase *CallBase) { CB = CallBase; } ++ ++private: ++ CallBase *CB; ++}; ++#endif ++ + //===----------------------------------------------------------------------===// + // CallBase Class + //===----------------------------------------------------------------------===// +@@ -1229,6 +1246,13 @@ protected: + unsigned getNumSubclassExtraOperandsDynamic() const; + + public: ++#if defined(ENABLE_AUTOTUNER) ++ // There is one-to-one 
correspondence between ATECallSite and CallBase class ++ // to enable auto-tuning. ++ std::unique_ptr ATECallSite = ++ std::make_unique(this); ++#endif ++ + using Instruction::getContext; + + /// Create a clone of \p CB with a different set of operand bundles and +diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h +index 8d60384e1a32..9d638af6eeef 100644 +--- a/llvm/include/llvm/IR/Instructions.h ++++ b/llvm/include/llvm/IR/Instructions.h +@@ -3287,6 +3287,23 @@ struct OperandTraits : public VariadicOperandTraits { + + DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value) + ++#if defined(ENABLE_AUTOTUNER) ++//===----------------------------------------------------------------------===// ++// AutoTuningEnabledSwitchInst Class ++//===----------------------------------------------------------------------===// ++class SwitchInst; ++ ++class AutoTuningEnabledSwitchInst : public autotuning::Container { ++public: ++ AutoTuningEnabledSwitchInst() = delete; ++ void initCodeRegion() override; ++ uint64_t computeStructuralHash() override; ++ AutoTuningEnabledSwitchInst(SwitchInst *SwitchInst) { SI = SwitchInst; } ++ ++private: ++ SwitchInst *SI; ++}; ++#endif + //===----------------------------------------------------------------------===// + // SwitchInst Class + //===----------------------------------------------------------------------===// +@@ -3332,6 +3349,13 @@ protected: + public: + void operator delete(void *Ptr) { User::operator delete(Ptr); } + ++#if defined(ENABLE_AUTOTUNER) ++ // There is one-to-one correspondence between ATESwitchInst and ++ // SwitchInst class to enable AutoTuner. 
++ std::unique_ptr ATESwitchInst = ++ std::make_unique(this); ++#endif ++ + // -2 + static const unsigned DefaultPseudoIndex = static_cast(~0L-1); + +diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h +index 670a40b28eab..904a450a1888 100644 +--- a/llvm/include/llvm/IR/Module.h ++++ b/llvm/include/llvm/IR/Module.h +@@ -38,6 +38,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + +diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h +index 1bdeb85afa3c..c0bcc8153eb8 100644 +--- a/llvm/include/llvm/IR/StructuralHash.h ++++ b/llvm/include/llvm/IR/StructuralHash.h +@@ -15,6 +15,9 @@ + #define LLVM_IR_STRUCTURALHASH_H + + #include ++#if defined(ENABLE_AUTOTUNER) ++#include ++#endif + + namespace llvm { + +@@ -24,6 +27,17 @@ class Module; + uint64_t StructuralHash(const Function &F); + uint64_t StructuralHash(const Module &M); + ++#if defined(ENABLE_AUTOTUNER) ++class MachineBasicBlock; ++class BasicBlock; ++class CallBase; ++class SwitchInst; ++ ++uint64_t StructuralHash(const std::vector BBs); ++uint64_t StructuralHash(const MachineBasicBlock &MBB); ++uint64_t StructuralHash(const CallBase &CB); ++uint64_t StructuralHash(const SwitchInst &SI); ++#endif + } // end namespace llvm + + #endif +diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h +index c6fee47b464b..80bec2d82e24 100644 +--- a/llvm/include/llvm/InitializePasses.h ++++ b/llvm/include/llvm/InitializePasses.h +@@ -340,6 +340,11 @@ void initializeWasmEHPreparePass(PassRegistry&); + void initializeWinEHPreparePass(PassRegistry&); + void initializeWriteBitcodePassPass(PassRegistry&); + void initializeXRayInstrumentationPass(PassRegistry&); ++#if defined(ENABLE_AUTOTUNER) ++void initializeAutotuningDumpLegacyPass(PassRegistry &); ++void initializeAutoTuningCompileFunctionLegacyPass(PassRegistry &); ++void 
initializeAutoTuningCompileModuleLegacyPass(PassRegistry &); ++#endif + + } // end namespace llvm + +diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h +index 7420ea64e954..3a8ecb1399f1 100644 +--- a/llvm/include/llvm/LinkAllPasses.h ++++ b/llvm/include/llvm/LinkAllPasses.h +@@ -54,6 +54,9 @@ + #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" + #include "llvm/Transforms/Vectorize.h" + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#endif + + namespace { + struct ForcePassLinking { +@@ -93,6 +96,11 @@ namespace { + (void) llvm::createInstSimplifyLegacyPass(); + (void) llvm::createInstructionCombiningPass(); + (void) llvm::createJMCInstrumenterPass(); ++#if defined(ENABLE_AUTOTUNER) ++ (void) llvm::createAutotuningDumpPass(); ++ (void) llvm::createAutoTuningCompileFunctionLegacyPass(); ++ (void) llvm::createAutoTuningCompileModuleLegacyPass(); ++#endif + (void) llvm::createKCFIPass(); + (void) llvm::createLCSSAPass(); + (void) llvm::createLICMPass(); +diff --git a/llvm/include/llvm/Remarks/Remark.h b/llvm/include/llvm/Remarks/Remark.h +index a66f7ed73f2f..3bcc0c710498 100644 +--- a/llvm/include/llvm/Remarks/Remark.h ++++ b/llvm/include/llvm/Remarks/Remark.h +@@ -20,6 +20,10 @@ + #include "llvm/Support/raw_ostream.h" + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include ++#include ++#endif + + namespace llvm { + namespace remarks { +@@ -47,6 +51,9 @@ struct Argument { + StringRef Key; + // FIXME: We might want to be able to store other types than strings here. + StringRef Val; ++#if defined(ENABLE_AUTOTUNER) ++ std::optional> VectorVal; ++#endif + // If set, the debug location corresponding to the value. 
+ std::optional Loc; + +@@ -65,6 +72,9 @@ enum class Type { + Analysis, + AnalysisFPCommute, + AnalysisAliasing, ++#if defined(ENABLE_AUTOTUNER) ++ AutoTuning, ++#endif + Failure, + First = Unknown, + Last = Failure +@@ -105,6 +115,28 @@ struct Remark { + /// Mangled name of the function that triggers the emssion of this remark. + StringRef FunctionName; + ++#if defined(ENABLE_AUTOTUNER) ++ /// Type of the code region that the remark is associated with. ++ std::optional CodeRegionType; ++ ++ /// Configuration value for generating the same baseline binary associated ++ /// with this remark. ++ std::optional> BaselineConfig; ++ ++ /// Hash of the code region that the remark is associated with. ++ std::optional CodeRegionHash; ++ ++ /// Configs values passed to AutoTuner for dynamic setting of search space ++ /// for code regions. ++ std::optional>> ++ AutoTunerOptions; ++ ++ /// Invocation/Registering of Optimization Pass in the compilation pipeline. ++ /// It is used to differentiate between different invocations of same ++ /// optimization pass. ++ std::optional Invocation; ++#endif ++ + /// The location in the source file of the remark. + std::optional Loc; + +diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h +index d2079fead668..c59dba2749f0 100644 +--- a/llvm/include/llvm/Support/CommandLine.h ++++ b/llvm/include/llvm/Support/CommandLine.h +@@ -40,6 +40,9 @@ + #include + #include + ++#if defined(ENABLE_AUTOTUNER) ++#include ++#endif + namespace llvm { + + namespace vfs { +@@ -72,6 +75,20 @@ bool ParseCommandLineOptions(int argc, const char *const *argv, + const char *EnvVar = nullptr, + bool LongOptionsUseDoubleDash = false); + ++#if defined(ENABLE_AUTOTUNER) ++// It will parse AutoTuner options (LLVMParams & ProgramParams) and add them as ++// command line flags for the compilation process. These options are suggested ++// by AutoTuner during tuning flow. 
This function will always be called after ++// AutoTuner initialization. ++// Returns true on success. Otherwise, this will print the error message to ++// stderr and exit. ++bool ParseAutoTunerOptions( ++ std::unordered_map LLVMParams, ++ std::unordered_map ProgramParams, ++ StringRef Overview = "", raw_ostream *Errs = nullptr, ++ const char *EnvVar = nullptr, bool LongOptionsUseDoubleDash = false); ++#endif ++ + // Function pointer type for printing version information. + using VersionPrinterTy = std::function; + +diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h +index aaba710cfde6..e69beeade947 100644 +--- a/llvm/include/llvm/Transforms/Scalar.h ++++ b/llvm/include/llvm/Transforms/Scalar.h +@@ -16,6 +16,10 @@ + + #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Pass.h" ++#include ++#endif + + namespace llvm { + +@@ -299,6 +303,19 @@ Pass *createLoopSimplifyCFGPass(); + // + FunctionPass *createInstSimplifyLegacyPass(); + ++#if defined(ENABLE_AUTOTUNER) ++//===--------------------------------------------------------------------===// ++// ++// createAutotuningCompilePass - It writes IR files with -fautotune-generate ++// for autotuning flow. It also enables/disables the execution of optimization ++// passes in subsequent compilations (with -fautotune) based on autotuning ++// methodology and available opportunities. 
++// ++FunctionPass * ++createAutoTuningCompileFunctionLegacyPass(std::string Pass = "unknown"); ++ModulePass * ++createAutoTuningCompileModuleLegacyPass(std::string Pass = "unknown"); ++#endif + + //===----------------------------------------------------------------------===// + // +diff --git a/llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h b/llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h +new file mode 100644 +index 000000000000..2cbb48f336ef +--- /dev/null ++++ b/llvm/include/llvm/Transforms/Scalar/AutoTuningCompile.h +@@ -0,0 +1,170 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===---------------- AutoTuningCompile.h - Auto-Tuning -------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// This file declares the interface for AutoTuning Incremental Compilation. ++/// Incremental compilation requires two passes 1) Module Pass and 2) Function ++/// Pass for legacy pass manager. It requires an additional Loop Pass for new ++/// pass manager. ++/// AutoTuningOptPassGate class is also defined here which is used to enable/ ++/// disable the execution of optimization passes for the compilation pipeline. ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef LLVM_AUTOTUNER_AUTOTUNING_COMPILE_H_ ++#define LLVM_AUTOTUNER_AUTOTUNING_COMPILE_H_ ++ ++#include "llvm/Analysis/LoopAnalysisManager.h" ++#include "llvm/Analysis/LoopInfo.h" ++#include "llvm/Analysis/LoopPass.h" ++#include "llvm/IR/OptBisect.h" ++#include "llvm/IR/PassManager.h" ++#include "llvm/Pass.h" ++#include "llvm/Transforms/Scalar/LoopPassManager.h" ++ ++namespace llvm { ++ ++class Pass; ++ ++// Skips or runs optimization passes. 
++class AutoTuningOptPassGate : public OptPassGate { ++public: ++ explicit AutoTuningOptPassGate(bool Skip = false) : Skip(Skip) {} ++ ++ bool shouldRunPass(const StringRef PassName, ++ StringRef IRDescription) override; ++ bool isEnabled() const override { return true; } ++ bool checkPass(const StringRef PassName, const StringRef TargetDesc); ++ void setSkip(bool Skip) { this->Skip = Skip; } ++ bool getSkip() const { return Skip; } ++ ++private: ++ bool Skip; ++}; ++ ++// Returns a static AutoTuningOptPassGate object which will be used to register ++// CallBack for OptBisect instrumentation. ++// It will also be used by AutoTuningCompile passes to enable/disable ++// optimization passes. ++AutoTuningOptPassGate &getAutoTuningOptPassGate(); ++ ++class AutoTuningCompileModule { ++public: ++ explicit AutoTuningCompileModule(std::string Pass = "unknown"); ++ bool run(Module &M); ++ // Write IR files for each module to be re-used in subsequent compilations ++ // for autotuning cycles. It only works with -fautotune-generate. ++ void writeIRFiles(Module &M) const; ++ // Enable/Disable execution of optimization passes in subsequent compilations ++ // based on autotuning methodology and available opportunities. 
It Only works ++ // with -fautotune ++ bool modifyCompilationPipeline(Module &M) const; ++ ++ static void setSkipCompilation(bool Option) { SkipCompilation = Option; } ++ static bool getSkipCompilation() { return SkipCompilation; } ++ ++private: ++ static bool SkipCompilation; ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileModuleLegacy : public ModulePass { ++public: ++ static char ID; ++ explicit AutoTuningCompileModuleLegacy(std::string Pass = "unknown"); ++ bool runOnModule(Module &M) override; ++ StringRef getPassName() const override; ++ void getAnalysisUsage(AnalysisUsage &AU) const override { ++ AU.setPreservesAll(); ++ } ++ ++private: ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileModulePass ++ : public PassInfoMixin { ++public: ++ explicit AutoTuningCompileModulePass(std::string Pass = "unknown") ++ : Pass(Pass){}; ++ PreservedAnalyses run(Module &M, ModuleAnalysisManager &); ++ ++private: ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileFunction { ++public: ++ explicit AutoTuningCompileFunction(std::string Pass = "unknown"); ++ bool run(Function &F); ++ // Write IR files for each module to be re-used in subsequent compilations ++ // for autotuning cycles. It only works with -fautotune-generate. ++ void writeIRFiles(Module &M); ++ // Enable/Disable execution of optimization passes in subsequent compilations ++ // based on autotuning methodology and available opportunities. It Only works ++ // with -fautotune ++ bool modifyCompilationPipeline(Function &F); ++ ++private: ++ // A module may have multiple functions; decision to enable/disable ++ // execution of an optimization pass will be made for the first function and ++ // will be used for all of the functions in the module. ++ // 'SkipDecision' will be set once the decision is made for a specific 'Pass'. ++ bool SkipDecision = false; ++ ++ // A module may have multiple functions; IR file will be written once for the ++ // entire module for a specific 'Pass'. 
++ bool IsModuleWritten = false; ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileFunctionLegacy : public FunctionPass { ++public: ++ static char ID; ++ explicit AutoTuningCompileFunctionLegacy(std::string Pass = "unknown"); ++ bool runOnFunction(Function &F) override; ++ StringRef getPassName() const override; ++ void getAnalysisUsage(AnalysisUsage &AU) const override { ++ AU.setPreservesAll(); ++ } ++ ++private: ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileFunctionPass ++ : public PassInfoMixin { ++public: ++ explicit AutoTuningCompileFunctionPass(std::string Pass = "unknown") ++ : Pass(Pass){}; ++ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); ++ ++private: ++ std::string Pass = ""; ++}; ++ ++class AutoTuningCompileLoopPass ++ : public PassInfoMixin { ++public: ++ explicit AutoTuningCompileLoopPass(std::string Pass = "unknown") ++ : Pass(Pass){}; ++ PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, ++ LoopStandardAnalysisResults &AR, LPMUpdater &U); ++ ++private: ++ std::string Pass = ""; ++}; ++ ++} // end namespace llvm ++ ++#endif /* LLVM_AUTOTUNER_AUTOTUNING_COMPILE_H_ */ ++#endif +diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +index 4f3010965b59..e1cccf417898 100644 +--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h ++++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +@@ -108,7 +108,11 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, + unsigned TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, + TargetTransformInfo::PeelingPreferences &PP, ++#if defined(ENABLE_AUTOTUNER) ++ bool &UseUpperBound, unsigned int Invocation = 0); ++#else + bool &UseUpperBound); ++#endif + + void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, +diff --git a/llvm/lib/Analysis/AutotuningDump.cpp b/llvm/lib/Analysis/AutotuningDump.cpp +new file mode 100644 
+index 000000000000..81b2bbead70e +--- /dev/null ++++ b/llvm/lib/Analysis/AutotuningDump.cpp +@@ -0,0 +1,265 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===-- AutotuningDump.cpp - Auto-Tuning---------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// ===--------------------------------------------------------------------===// ++// ++// This file contains pass collecting IR of tuned regions and storing them into ++// predetrmined locations, to be used later by autotuning ML guidance ++// ++// ===--------------------------------------------------------------------===// ++#include "llvm/Analysis/AutotuningDump.h" ++#include "llvm/Analysis/Passes.h" ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/IR/LegacyPassManager.h" ++#include "llvm/InitializePasses.h" ++#include "llvm/Pass.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/Path.h" ++#include "llvm/Support/Process.h" ++#include "llvm/Support/raw_ostream.h" ++#include ++ ++using namespace llvm; ++ ++#define DEBUG_TYPE "autotuning-dump" ++ ++enum AutotuningDumpOpt { whole_modules, functions, loops }; ++ ++// Enable Debug Options to be specified on the command line ++cl::opt AutotuningDumpMode( ++ "autotuning-dump-mode", cl::desc("Choose autotuning dump mode:"), ++ cl::init(whole_modules), ++ cl::values(clEnumVal(whole_modules, "dump each module in its own file"), ++ clEnumVal(functions, "dump each function in its own file"), ++ clEnumVal(loops, "dump each loop in its own file"))); ++ ++AutotuningDump::AutotuningDump(bool IncrementalCompilation) { ++ // Check if the environment variable AUTOTUNE_DATADIR is set. 
++ IsIncrementalCompilation = IncrementalCompilation; ++ AutoTuneDirPath = "autotune_datadir"; ++ if (std::optional MaybePath = ++ llvm::sys::Process::GetEnv("AUTOTUNE_DATADIR")) ++ AutoTuneDirPath = *MaybePath; ++} ++ ++int AutotuningDump::getConfigNumber() { ++ auto ConfigNumOrErr = autotuning::Engine.getConfigNumber(); ++ if (ConfigNumOrErr) ++ return *ConfigNumOrErr; ++ else { ++ report_fatal_error("Invalid/missing Autotuner configuration ID"); ++ return -1; ++ } ++} ++ ++void AutotuningDump::dumpToStream(llvm::raw_ostream &os, const Loop &L) const { ++ L.print(os); ++} ++ ++void AutotuningDump::dumpToStream(llvm::raw_ostream &os, ++ const Function &F) const { ++ F.print(os, /*AAW*/ nullptr, /*ShouldPreserveUseListOrder*/ false, ++ /*IsForDebug*/ false, /*PrintCompleteIR*/ true); ++} ++ ++// Create appropriate file. File will contains AbsolutePath/FileName. ++std::unique_ptr AutotuningDump::createFile(const Twine &File) { ++ std::error_code EC; ++ return std::make_unique((File).str(), EC, ++ sys::fs::CD_CreateAlways, ++ sys::fs::FA_Write, sys::fs::OF_None); ++} ++ ++std::string AutotuningDump::getDirectoryName(const std::string File) const { ++ std::string DirectoryName = AutoTuneDirPath; ++ if (!autotuning::Engine.isMLEnabled()) ++ DirectoryName += "/IR_files"; ++ ++ DirectoryName = DirectoryName + "/" + File + "/"; ++ ++ // Create directory if not already present. 
++ if (std::error_code EC = sys::fs::create_directories(DirectoryName)) ++ errs() << "could not create directory: " << DirectoryName << ": " ++ << EC.message(); ++ ++ return DirectoryName; ++} ++ ++std::string AutotuningDump::getFileName(std::string FilePath) { ++ if (autotuning::Engine.isMLEnabled()) ++ return std::to_string(this->getConfigNumber()) + ".ll"; ++ std::replace(FilePath.begin(), FilePath.end(), '/', '_'); ++ return FilePath + ".ll"; ++} ++ ++void AutotuningDump::dumpModule(Module &M) { ++ std::unique_ptr fptr; ++ LLVM_DEBUG(dbgs() << "AutotuningDump: Dump module IR files.\n"); ++ if (IsIncrementalCompilation) { ++ std::string Filename = M.getSourceFileName(); ++ llvm::SmallString<128> FilenameVec = StringRef(Filename); ++ llvm::sys::fs::make_absolute(FilenameVec); ++ size_t Pos = FilenameVec.rfind("."); ++ if (Pos != std::string::npos) { ++ FilenameVec.pop_back_n(FilenameVec.size() - Pos); ++ FilenameVec.append(".ll"); ++ } ++ fptr = createFile(FilenameVec); ++ } else { ++ std::string File = llvm::sys::path::filename(M.getName()).str(); ++ std::string DirectoryName = getDirectoryName(File); ++ std::string FileName = getFileName(M.getName().str()); ++ fptr = createFile(DirectoryName + FileName); ++ } ++ ++ M.print(*fptr, nullptr, true, false); ++} ++ ++void AutotuningDump::dumpFunctions(Module &M) { ++ std::string FilePath = M.getName().str(); ++ std::replace(FilePath.begin(), FilePath.end(), '/', '_'); ++ std::string DirectoryName = getDirectoryName(FilePath); ++ for (Function &F : M.getFunctionList()) { // go through all functions ++ if (F.isDeclaration() || F.empty()) ++ continue; ++ ++ AutoTuningEnabledFunction *AutotuneFunc = &F.getATEFunction(); ++ assert(AutotuneFunc); ++ autotuning::Engine.initContainer(AutotuneFunc, "autotuning-dump", ++ F.getName(), false); ++ std::string FuncName = F.getName().str(); ++ // check the whole function ++ if (AutotuneFunc->requiresIRDump(true)) { ++ auto fptr = createFile(DirectoryName + Twine(FuncName) + ".ll"); 
++ this->dumpToStream(*fptr, F); ++ } ++ } ++} ++ ++void AutotuningDump::dumpLoops(Module &M, ++ function_ref GetLI) { ++ for (Function &F : M) { ++ // Nothing to do for declarations. ++ if (F.isDeclaration() || F.empty()) ++ continue; ++ ++ LoopInfo &LI = GetLI(F); ++ for (auto &L : LI.getLoopsInPreorder()) { ++ Function *Func = nullptr; ++ StringRef FuncName = ""; ++ if (!L->isInvalid()) ++ Func = L->getHeader()->getParent(); ++ if (Func) ++ FuncName = Func->getName(); ++ ++ autotuning::Engine.initContainer(L, "autotuning-dump", FuncName, false); ++ if (L->requiresIRDump()) { ++ std::string FuncName = L->getCodeRegion().getFuncName(); ++ unsigned SourceLine = L->getCodeRegion().getSourceLoc().SourceLine; ++ std::string DirectoryName = AutoTuneDirPath + "/" + ++ llvm::sys::path::filename(FuncName).str() + ++ "_loop_" + std::to_string(SourceLine); ++ std::string FileName = std::to_string(this->getConfigNumber()) + ".ll"; ++ auto fptr = createFile(DirectoryName + "/" + FileName); ++ this->dumpToStream(*fptr, *L); ++ } ++ } ++ } ++} ++ ++bool AutotuningDump::run(Module &M, ++ function_ref GetLI) { ++ // Change to absolute path. ++ SmallString<256> OutputPath = StringRef(AutoTuneDirPath); ++ sys::fs::make_absolute(OutputPath); ++ ++ // Creating new output directory, if it does not exists. 
++ if (std::error_code EC = sys::fs::create_directories(OutputPath)) { ++ llvm::errs() << (make_error( ++ "could not create directory: " + Twine(OutputPath) + ": " + ++ EC.message(), ++ EC)); ++ return false; ++ } ++ ++ if (IsIncrementalCompilation) { ++ LLVM_DEBUG( ++ dbgs() ++ << "AutotuningDump: IR files writing for incremental compilation.\n"); ++ dumpModule(M); ++ return false; ++ } ++ ++ switch (AutotuningDumpMode) { ++ case whole_modules: ++ dumpModule(M); ++ break; ++ case functions: ++ dumpFunctions(M); ++ break; ++ case loops: ++ dumpLoops(M, GetLI); ++ } ++ ++ return false; ++} ++ ++AutotuningDumpLegacy::AutotuningDumpLegacy(bool IncrementalCompilation) ++ : ModulePass(AutotuningDumpLegacy::ID) { ++ IsIncrementalCompilation = IncrementalCompilation; ++ initializeAutotuningDumpLegacyPass(*PassRegistry::getPassRegistry()); ++} ++ ++bool AutotuningDumpLegacy::runOnModule(Module &M) { ++ if (!autotuning::Engine.isDumpEnabled()) ++ return false; ++ ++ auto GetLI = [this](Function &F) -> LoopInfo & { ++ return getAnalysis(F).getLoopInfo(); ++ }; ++ ++ AutotuningDump Impl(IsIncrementalCompilation); ++ return Impl.run(M, GetLI); ++} ++ ++StringRef AutotuningDumpLegacy::getPassName() const { ++ return "Autotuning Dump"; ++} ++ ++void AutotuningDumpLegacy::getAnalysisUsage(AnalysisUsage &AU) const { ++ AU.setPreservesAll(); ++ AU.addRequired(); ++} ++ ++char AutotuningDumpLegacy::ID = 0; ++INITIALIZE_PASS_BEGIN(AutotuningDumpLegacy, "autotuning-dump", ++ "Dump IR for Autotuned Code Regions", false, false) ++INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) ++INITIALIZE_PASS_END(AutotuningDumpLegacy, "autotuning-dump", ++ "Dump IR for Autotuned Code Regions", false, false) ++ ++ModulePass *llvm::createAutotuningDumpPass() { ++ return new AutotuningDumpLegacy(); ++} ++ ++AnalysisKey AutotuningDumpAnalysis::Key; ++ ++AutotuningDumpAnalysis::Result ++AutotuningDumpAnalysis::run(Module &M, ModuleAnalysisManager &AM) { ++ if (!autotuning::Engine.isDumpEnabled()) ++ return 
false; ++ ++ auto &FAM = AM.getResult(M).getManager(); ++ auto GetLI = [&FAM](Function &F) -> LoopInfo & { ++ return FAM.getResult(F); ++ }; ++ ++ AutotuningDump Impl(IsIncrementalCompilation); ++ Impl.run(M, GetLI); ++ return false; ++} ++#endif +diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt +index 4a1797c42789..9c6a70f0221f 100644 +--- a/llvm/lib/Analysis/CMakeLists.txt ++++ b/llvm/lib/Analysis/CMakeLists.txt +@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMAnalysis + Analysis.cpp + AssumeBundleQueries.cpp + AssumptionCache.cpp ++ AutotuningDump.cpp + BasicAliasAnalysis.cpp + BlockFrequencyInfo.cpp + BlockFrequencyInfoImpl.cpp +@@ -153,6 +154,7 @@ add_llvm_component_library(LLVMAnalysis + ${MLLinkDeps} + + LINK_COMPONENTS ++ AutoTuner + BinaryFormat + Core + Object +diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp +index e2480d51d372..f6b3c14a0345 100644 +--- a/llvm/lib/Analysis/InlineAdvisor.cpp ++++ b/llvm/lib/Analysis/InlineAdvisor.cpp +@@ -383,15 +383,27 @@ llvm::shouldInline(CallBase &CB, + Function *Callee = CB.getCalledFunction(); + Function *Caller = CB.getCaller(); + ++#if defined(ENABLE_AUTOTUNER) ++ // Get the code Region to add BaselineConfig values for inline ++ const autotuning::CodeRegion &CR = CB.ATECallSite.get()->getCodeRegion(); ++ static const std::string ForceInlineParamStr = "ForceInline"; ++#endif ++ + if (IC.isAlways()) { + LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC) + << ", Call: " << CB << "\n"); ++#if defined(ENABLE_AUTOTUNER) ++ autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "1"}}); ++#endif + return IC; + } + + if (!IC) { + LLVM_DEBUG(dbgs() << " NOT Inlining " << inlineCostStr(IC) + << ", Call: " << CB << "\n"); ++#if defined(ENABLE_AUTOTUNER) ++ autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "0"}}); ++#endif + if (IC.isNever()) { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) +@@ 
-417,6 +429,9 @@ llvm::shouldInline(CallBase &CB, + LLVM_DEBUG(dbgs() << " NOT Inlining: " << CB + << " Cost = " << IC.getCost() + << ", outer Cost = " << TotalSecondaryCost << '\n'); ++#if defined(ENABLE_AUTOTUNER) ++ autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "0"}}); ++#endif + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts", + Call) +@@ -430,6 +445,9 @@ llvm::shouldInline(CallBase &CB, + + LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC) << ", Call: " << CB + << '\n'); ++#if defined(ENABLE_AUTOTUNER) ++ autotuning::Engine.addOpportunity(CR, {{ForceInlineParamStr, "1"}}); ++#endif + return IC; + } + +diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp +index a2f46edcf5ef..9f8f57865de2 100644 +--- a/llvm/lib/Analysis/InlineCost.cpp ++++ b/llvm/lib/Analysis/InlineCost.cpp +@@ -162,6 +162,14 @@ static cl::opt DisableGEPConstOperand( + "disable-gep-const-evaluation", cl::Hidden, cl::init(false), + cl::desc("Disables evaluation of GetElementPtr with constant operands")); + ++#if defined(ENABLE_AUTOTUNER) ++static cl::opt ++ EnableLocalCallSiteTuning("auto-tuning-enable-local-callsite-tuning", ++ cl::init(false), cl::Hidden, ++ cl::desc("Enable AutoTuning for local callsites " ++ "as well.")); ++#endif ++ + namespace llvm { + std::optional getStringFnAttrAsInt(const Attribute &Attr) { + if (Attr.isValid()) { +@@ -2990,6 +2998,27 @@ InlineCost llvm::getInlineCost( + return llvm::InlineCost::getNever(UserDecision->getFailureReason()); + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled() && Call.getCaller() && ++ (!Callee->hasLocalLinkage() || EnableLocalCallSiteTuning)) { ++ bool ForceInline = false; ++ bool Found = false; ++ ++ autotuning::Engine.initContainer(Call.ATECallSite.get(), "inline", ++ Call.getCaller()->getName(), ++ /* addOpportunity */ false); ++ ++ Found = Call.ATECallSite->lookUpParams("ForceInline", ForceInline); ++ ++ if (Found) { ++ 
if (ForceInline) ++ return llvm::InlineCost::getAlways("Force inlined by auto-tuning"); ++ else ++ return llvm::InlineCost::getNever("Force non-inlined by auto-tuning"); ++ } ++ } ++#endif ++ + LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() + << "... (caller:" << Call.getCaller()->getName() + << ")\n"); +diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp +index 60a72079e864..36aca73ee675 100644 +--- a/llvm/lib/Analysis/LoopInfo.cpp ++++ b/llvm/lib/Analysis/LoopInfo.cpp +@@ -37,6 +37,10 @@ + #include "llvm/Support/CommandLine.h" + #include "llvm/Support/GenericLoopInfoImpl.h" + #include "llvm/Support/raw_ostream.h" ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/IR/StructuralHash.h" ++#endif + using namespace llvm; + + // Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops. +@@ -663,6 +667,54 @@ Loop::LocRange Loop::getLocRange() const { + return LocRange(); + } + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t Loop::computeStructuralHash() { ++ std::vector BBs = getBlocks(); ++ return StructuralHash(BBs); ++} ++ ++void Loop::initCodeRegion() { ++ std::string LoopName; ++ // use the header's name as the loop name ++ if (BasicBlock *Header = getHeader()) { ++ if (Header->hasName()) { ++ LoopName = Header->getName().str(); ++ } ++ // if the header doesn't have a name, ++ // use the label of this header from AsmWriter ++ else { ++ std::string Str; ++ llvm::raw_string_ostream RSO(Str); ++ Header->printAsOperand(RSO); ++ LoopName = RSO.str(); ++ } ++ } else { ++ LoopName = ""; ++ } ++ ++ Function *F = this->getHeader()->getParent(); ++ StringRef FuncName = F->getName(); ++ ++ // init the CodeRegion ++ autotuning::CodeRegion CR = autotuning::CodeRegion( ++ LoopName, FuncName.data(), autotuning::CodeRegionType::Loop, ++ this->getStartLoc()); ++ // Compute the number of non-debug IR instructions in this loop. 
++ unsigned TotalNumInstrs = 0; ++ for (const BasicBlock *BB : this->getBlocks()) { ++ unsigned NumInstrs = std::distance(BB->instructionsWithoutDebug().begin(), ++ BB->instructionsWithoutDebug().end()); ++ TotalNumInstrs += NumInstrs; ++ } ++ CR.setSize(TotalNumInstrs); ++ // Compute hotness. ++ autotuning::HotnessType Hotness = F->ATEFunction.getHotness(); ++ CR.setHotness(Hotness); ++ ++ this->setCodeRegion(CR); ++} ++#endif ++ + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + LLVM_DUMP_METHOD void Loop::dump() const { print(dbgs()); } + +diff --git a/llvm/lib/AutoTuner/AutoTuning.cpp b/llvm/lib/AutoTuner/AutoTuning.cpp +new file mode 100644 +index 000000000000..1f09f06d84a2 +--- /dev/null ++++ b/llvm/lib/AutoTuner/AutoTuning.cpp +@@ -0,0 +1,705 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===-- AutoTuning.cpp - Auto-Tuning --------------------------------------===// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines Auto Tuning related functions, models and interfaces. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/ADT/STLExtras.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/AutoTuner/AutoTuningRemarkManager.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/Error.h" ++#include "llvm/Support/Process.h" ++ ++// Enable debug messages for AutoTuning. ++#define DEBUG_TYPE "autotuning" ++ ++using namespace llvm; ++ ++// defined in 'lib/Remarks/YAMLRemarkParser.cpp'. ++extern cl::opt OmitAutotuningMetadata; ++ ++// -auto-tuning-input - Command line option to specify the input file. 
++static cl::opt InputFile("auto-tuning-input", cl::Hidden, ++ cl::desc("Specify the input file")); ++ ++// -auto-tuning-opp - Command line option to specify the output directory of ++// tuning opportunities. ++static cl::opt OutputOppDir( ++ "auto-tuning-opp", cl::Hidden, ++ cl::desc("Specify the output directory of tuning opportunities")); ++ ++static cl::opt ++ RemarksPasses("auto-tuning-pass-filter", cl::Hidden, ++ cl::desc("Only dump auto-tuning remarks from passes whose " ++ "names match the given regular expression"), ++ cl::value_desc("regex")); ++ ++static cl::opt ++ ProjectDir("autotuning-project-dir", cl::Hidden, cl::init(""), ++ cl::desc("Specify project base dir to make code region name " ++ "relative to base dir. This operation will only be " ++ "applied for coarse-grain code regions.")); ++ ++// -auto-tuning-config-id - Command line option to specify the config number ++// being used for compilation. Required only for ML guidance feature. ++static cl::opt CFGNumber( ++ "auto-tuning-config-id", cl::Hidden, ++ cl::desc( ++ "Specify the auto-tuning configuration ID used in this compilation.")); ++ ++static cl::opt OutputFormat( ++ "auto-tuning-remark-format", cl::Hidden, ++ cl::desc("The format used for auto-tuning remarks (default: YAML)"), ++ cl::value_desc("format"), cl::init("yaml")); ++ ++// AutoTuner incremental compilation options. 
++cl::opt AutoTuningCompileMode( ++ "auto-tuning-compile-mode", cl::Hidden, cl::init(Inactive), ++ cl::desc("AutoTuner: Choose incremental compilation mode."), ++ cl::values(clEnumVal(Inactive, ++ "AutoTuner: Disable incremental compilation."), ++ clEnumVal(CoarseGrain, "AutoTuner: Enable incremental " ++ "compilation for coarse grain tuning."), ++ clEnumVal(FineGrain, "AutoTuner: Enable incremental compilation " ++ "for fine grain tuning."), ++ clEnumVal(Basic, "AutoTuner: Enable incremental compilation for " ++ "any kind of code region."))); ++ ++static cl::opt ++ EnableAutoTuningDump("enable-autotuning-dump", cl::Hidden, cl::init(false), ++ cl::desc("Enable AutoTuningDump Pass")); ++ ++static cl::opt ++ ThinLTOTuning("autotuning-thin-lto", cl::Hidden, cl::init(false), ++ cl::desc("AutoTuner enabled in ThinLTO mode.")); ++ ++namespace autotuning { ++ ++static cl::list AutotuningOutputFilter( ++ "auto-tuning-type-filter", cl::Hidden, cl::CommaSeparated, ++ cl::desc( ++ "Select types of code regions to dump auto-tuning opportunities for:"), ++ cl::values(clEnumVal(LLVMParam, "LLVMParam code regions only"), ++ clEnumVal(ProgramParam, "ProgramParam code regions only"), ++ clEnumVal(CallSite, "CallSite code regions only"), ++ clEnumVal(Function, "Function code regions only"), ++ clEnumVal(Loop, "Loop code regions only"), ++ clEnumVal(MachineBasicBlock, ++ "Machine basic block code regions only"), ++ clEnumVal(Switch, "Switch code regions only"), ++ clEnumVal(Other, "All other types of code regions"))); ++ ++static cl::list AutotuningFunctionFilter( ++ "auto-tuning-function-filter", cl::Hidden, cl::CommaSeparated, ++ cl::desc("Apply code region filtering based on function names")); ++ ++static const cl::opt ExcludeColdCodeRegion( ++ "auto-tuning-exclude-cold", cl::Hidden, cl::init(true), ++ cl::desc("Use profile data to prune cold code regions from auto-tuning")); ++ ++static const cl::opt CodeRegionMatchingWithHash( ++ "auto-tuning-code-region-matching-hash", 
cl::Hidden, cl::init(true), ++ cl::desc("Use IR hashing to match the Code Regions")); ++ ++static const cl::opt HotCodeRegionOnly( ++ "auto-tuning-hot-only", cl::Hidden, cl::init(false), ++ cl::desc( ++ "Use profile data to include hot code regions only from auto-tuning")); ++ ++static const cl::opt ++ SizeThreshold("auto-tuning-size-threshold", cl::Hidden, cl::init(0), ++ cl::desc("Prune small code regions from auto-tuning with a " ++ "size smaller than the threshold")); ++ ++static inline const std::string generateName(const std::string &Name) { ++ if (Name.empty()) ++ return "unnamed"; ++ else ++ return Name; ++} ++ ++//===----------------------------------------------------------------------===// ++// CodeRegion implementation ++CodeRegion::CodeRegion(const CodeRegionType Type) : Type(Type) {} ++ ++CodeRegion::CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, const DebugLoc &DL, ++ const DynamicOptions DO) { ++ this->Name = generateName(Name); ++ this->FuncName = generateName(FuncName); ++ this->Type = Type; ++ this->StringType = getTypeAsString(Type); ++ if (DL) { ++ StringRef File = DL->getFilename(); ++ unsigned Line = DL->getLine(); ++ unsigned Col = DL->getColumn(); ++ this->Location = SourceLocation{File.str(), Line, Col}; ++ } ++ this->AutoTunerOptions = DO; ++} ++ ++CodeRegion::CodeRegion(const std::string &Name, const std::string &FuncName, ++ const CodeRegionType &Type, ++ const SourceLocation &Location, ++ const DynamicOptions DO) { ++ this->Name = generateName(Name); ++ this->FuncName = generateName(FuncName); ++ this->Type = Type; ++ this->StringType = getTypeAsString(Type); ++ this->Location = Location; ++ this->AutoTunerOptions = DO; ++} ++ ++CodeRegion::CodeRegion(const std::string &Name, const std::string &FuncName, ++ const std::string &PassName, const CodeRegionType &Type, ++ const SourceLocation &Location, ++ const unsigned int Invocation) ++ : CodeRegion(Name, FuncName, Type, Location) { ++ 
this->PassName = generateName(PassName); ++ this->Invocation = Invocation; ++} ++ ++bool CodeRegion::operator==(const CodeRegion &CodeRegion) const { ++ bool IsEqual = false; ++ if (OmitAutotuningMetadata) ++ IsEqual = (this->getHash() == CodeRegion.getHash()) && ++ (this->Type == CodeRegion.getType()) && ++ (this->PassName == CodeRegion.getPassName()); ++ else { ++ IsEqual = (this->Type == CodeRegion.getType()) && ++ (this->Name == CodeRegion.getName()) && ++ (this->PassName == CodeRegion.getPassName()) && ++ (this->FuncName == CodeRegion.getFuncName()) && ++ (this->Location == CodeRegion.getSourceLoc()); ++ if (CodeRegionMatchingWithHash) ++ IsEqual = IsEqual && (this->getHash() == CodeRegion.getHash()); ++ } ++ ++ if (autotuning::Engine.ParseInput) ++ IsEqual = IsEqual && this->getInvocation() == CodeRegion.getInvocation(); ++ ++ if (autotuning::Engine.GenerateOutput) ++ IsEqual = ++ IsEqual && this->getBaselineConfig() == CodeRegion.getBaselineConfig(); ++ ++ return IsEqual; ++} ++ ++std::string CodeRegion::getTypeAsString(CodeRegionType CRType) { ++ switch (CRType) { ++ case autotuning::CodeRegionType::MachineBasicBlock: ++ return "machine_basic_block"; ++ case autotuning::CodeRegionType::Loop: ++ return "loop"; ++ case autotuning::CodeRegionType::Function: ++ return "function"; ++ case autotuning::CodeRegionType::CallSite: ++ return "callsite"; ++ case autotuning::CodeRegionType::LLVMParam: ++ return "llvm-param"; ++ case autotuning::CodeRegionType::ProgramParam: ++ return "program-param"; ++ case autotuning::CodeRegionType::Switch: ++ return "switch"; ++ default: ++ return "other"; ++ } ++} ++ ++std::string CodeRegion::getHotnessAsString(HotnessType Hotness) { ++ switch (Hotness) { ++ case autotuning::HotnessType::Cold: ++ return "cold"; ++ case autotuning::HotnessType::Hot: ++ return "hot"; ++ default: ++ return "unknown"; ++ } ++} ++ ++void CodeRegion::setPassName(const std::string &NewPassName) { ++ this->PassName = generateName(NewPassName); ++} ++ ++/* 
static */ ++autotuning::CodeRegion CodeRegion::getInvalidInstance() { ++ static autotuning::CodeRegion Invalid = ++ CodeRegion(autotuning::CodeRegionType::Invalid); ++ return Invalid; ++} ++ ++/* static */ ++autotuning::CodeRegion CodeRegion::getEmptyInstance() { ++ static autotuning::CodeRegion Empty = ++ CodeRegion(autotuning::CodeRegionType::Empty); ++ return Empty; ++} ++ ++//===----------------------------------------------------------------------===// ++// Container implementation ++// ++ ++const CodeRegion &Container::getCodeRegion() const { return CR; } ++ ++void Container::setCodeRegion(const CodeRegion &NewCR) { this->CR = NewCR; } ++ ++template ++bool Container::lookUpParams(const std::string &ParamsName, T &Value) const { ++ bool Found = false; ++ auto ConfigMapIterator = Engine.ParamTable.find(CR); ++ if (ConfigMapIterator != Engine.ParamTable.end()) { ++ ParameterManager InputParams = ConfigMapIterator->second; ++ Found = InputParams.findByName(ParamsName, Value); ++ if (Found) { ++ LLVM_DEBUG(dbgs() << ParamsName << " is set for the CodeRegion: \n" ++ << " Name: " << CR.getName() << "\n" ++ << " FuncName: " << CR.getFuncName() << "\n" ++ << " PassName: " << CR.getPassName() << "\n" ++ << " Type: " << CR.getTypeAsString() << "\n" ++ << " Hash: " << CR.getHash() << "\n" ++ << "\n"); ++ } ++ } ++ return Found; ++} ++ ++bool Container::requiresIRDump(bool IsFunctionIR) const { ++ auto findBaselineRegion = [&]() -> bool { ++ for (auto &entry : Engine.TuningOpps) ++ if (!IsFunctionIR) { ++ if (CR.getSourceLoc() == entry.getSourceLoc()) ++ return true; ++ } else { ++ if (CR.getFileName() == entry.getFileName() && ++ CR.getFuncName() == entry.getFuncName()) ++ return true; ++ } ++ return false; ++ }; ++ auto findNonBaselineRegion = [&]() { ++ for (auto &entry : Engine.ParamTable) ++ if (!IsFunctionIR) { ++ if (CR.getSourceLoc() == entry.first.getSourceLoc()) ++ return true; ++ } else { ++ if (CR.getFileName() == entry.first.getFileName() && ++ 
CR.getFuncName() == entry.first.getFuncName()) ++ return true; ++ } ++ return false; ++ }; ++ ++ if (CFGNumber == -1) ++ return findBaselineRegion(); ++ else ++ return findNonBaselineRegion(); ++} ++ ++template bool Container::lookUpParams(const std::string &ParamsName, ++ int &Value) const; ++template bool Container::lookUpParams(const std::string &ParamsName, ++ bool &Value) const; ++template bool ++Container::lookUpParams(const std::string &ParamsName, ++ std::string &Value) const; ++template bool Container::lookUpParams>( ++ const std::string &ParamsName, std::vector &Value) const; ++ ++static unsigned int count(SmallVector CallSiteLocs, ++ CallSiteLocation Loc) { ++ unsigned int Count = 0; ++ for (unsigned int Idx = 0; Idx < CallSiteLocs.size(); ++Idx) { ++ if (Loc.Caller == CallSiteLocs[Idx].Caller && ++ Loc.Callee == CallSiteLocs[Idx].Callee) ++ Count++; ++ } ++ return Count; ++} ++ ++bool AutoTuningEngine::isThinLTOTuning() const { return ThinLTOTuning; } ++ ++CodeRegionType AutoTuningEngine::convertPassToType(std::string PassName) { ++ auto Search = PTTMap.find(PassName); ++ if (Search == PTTMap.end()) ++ llvm_unreachable( ++ "AutoTuningEngine: Invalid/unsupported optimization pass provided.\n"); ++ return Search->second; ++} ++ ++void AutoTuningEngine::insertCallSiteLoc(CallSiteLocation Loc) { ++ CallSiteLocs.emplace_back(Loc); ++} ++ ++// If a function has multiple calls to same callee, then insert all the calls in ++// the CallSiteLocs vector which get available due to inlining of such calls. ++// It will use "Original Call Line No + New Call Line No" instead of using ++// "DebugLoc Line No". 
++void AutoTuningEngine::updateCallSiteLocs(llvm::CallBase *OldCB, ++ llvm::CallBase *NewCB, ++ llvm::Function *Callee, ++ unsigned int Line) { ++ for (unsigned int Idx = 0; Idx < CallSiteLocs.size(); ++Idx) { ++ if (OldCB == CallSiteLocs[Idx].CB) { ++ CallSiteLocation Loc = CallSiteLocs[Idx]; ++ Loc.CB = NewCB; ++ Loc.Callee = Callee; ++ Loc.SrcLoc.SourceLine = Loc.SrcLoc.SourceLine + Line; ++ CallSiteLocs.emplace_back(Loc); ++ break; ++ } ++ } ++} ++ ++void AutoTuningEngine::cleanCallSiteLoc() { ++ unsigned int Size = CallSiteLocs.size(); ++ unsigned int Idx = 0; ++ for (unsigned int I = 0; I < Size; ++I) { ++ CallSiteLocation Loc = CallSiteLocs[Idx]; ++ unsigned int Count = count(CallSiteLocs, Loc); ++ if (Count == 1) { ++ CallSiteLocs.erase(CallSiteLocs.begin() + Idx); ++ continue; ++ } ++ Idx++; ++ } ++} ++ ++void AutoTuningEngine::clearCallSiteLocs() { CallSiteLocs.clear(); } ++ ++std::optional ++AutoTuningEngine::getCallSiteLoc(llvm::CallBase *CB) { ++ for (unsigned int Idx = 0; Idx < CallSiteLocs.size(); ++Idx) { ++ if (CB == CallSiteLocs[Idx].CB) ++ return CallSiteLocs[Idx].SrcLoc.SourceLine; ++ } ++ return std::nullopt; ++} ++ ++void AutoTuningEngine::addOpportunity( ++ const CodeRegion &OppCR, ++ std::map BaselineConfig) { ++ if (!OppCR.Initialized) ++ return; ++ ++ OppCR.setBaselineConfig(BaselineConfig); ++ if (!TuningOpps.contains(OppCR)) ++ TuningOpps.insert(OppCR); ++ else if (OppCR.getHotness() != Unknown) { ++ // If OppCR already exists in TuningOpps with unknown hotness, ++ // then update it if the current hotness is hot/cold. 
++ auto OppI = find(TuningOpps, OppCR); ++ if (OppI->getHotness() == Unknown) ++ OppI->setHotness(OppCR.getHotness()); ++ } ++} ++ ++void AutoTuningEngine::applyOppFilters(CodeRegions &CRs) { ++ CodeRegions NewCRs; ++ for (CodeRegion CR : CRs) { ++ if (AutotuningOutputFilter.getNumOccurrences() > 0) { ++ bool IsMatched = false; ++ for (auto CRType : AutotuningOutputFilter) { ++ if (CRType == CR.getType()) { ++ IsMatched = true; ++ break; ++ } ++ } ++ // Filter out the CodeRegion if its type fails to match any types ++ // specified from the command line. ++ if (!IsMatched) ++ continue; ++ } ++ if (SizeThreshold.getNumOccurrences() > 0 && CR.getSize() < SizeThreshold) ++ continue; ++ if (ExcludeColdCodeRegion && CR.isCold()) { ++ LLVM_DEBUG(dbgs() << "Skip CodeRegion with cold function " ++ << CR.getFuncName() << "\n"); ++ continue; ++ } ++ if (HotCodeRegionOnly && !CR.isHot()) { ++ LLVM_DEBUG(dbgs() << "Skip CodeRegion with " << CR.getHotnessAsString() ++ << " function " << CR.getFuncName() << "\n"); ++ continue; ++ } ++ NewCRs.insert(CR); ++ LLVM_DEBUG(dbgs() << "CodeRegion added as an tuning opportunity: \n" ++ << " Name: " << CR.getName() << "\n" ++ << " FuncName: " << CR.getFuncName() << "\n" ++ << " PassName: " << CR.getPassName() << "\n" ++ << " Type: " << CR.getTypeAsString() << "\n" ++ << " Size: " << CR.getSize() << "\n" ++ << " Hotness: " << CR.getHotnessAsString() << "\n" ++ << " Hash: " << CR.getHash() << "\n" ++ << " Location: " << CR.getSourceLoc().SourceFilePath ++ << "; " << CR.getSourceLoc().SourceLine << "; " ++ << CR.getSourceLoc().SourceColumn << "\n\n"); ++ } ++ if (AutotuningOutputFilter.getNumOccurrences() == 0 || ++ std::find(AutotuningOutputFilter.begin(), AutotuningOutputFilter.end(), ++ Other) != AutotuningOutputFilter.end()) { ++ // Add an empty CodeRegion with ModuleID as an tuning opportunity. ++ // It could be used to represent a module level code region. 
++ autotuning::CodeRegion GlobalCR = ++ CodeRegion(ModuleID, "none", "all", Other); ++ GlobalCR.setHash(llvm::hash_combine(ModuleID, Other)); ++ NewCRs.insert(GlobalCR); ++ LLVM_DEBUG(dbgs() << "Module added as an tuning opportunity: \n" ++ << " Name: " << GlobalCR.getName() << "\n" ++ << " Hash: " << GlobalCR.getHash() << "\n" ++ << "\n"); ++ } ++ ++ // Include LLVMParam as an tuning opportunity only if it is specified with ++ // -auto-tuning-type-filter. ++ if (std::find(AutotuningOutputFilter.begin(), AutotuningOutputFilter.end(), ++ LLVMParam) != AutotuningOutputFilter.end()) ++ NewCRs.insert(CodeRegion(ModuleID, "none", "none", LLVMParam)); ++ ++ if (std::find(AutotuningOutputFilter.begin(), AutotuningOutputFilter.end(), ++ ProgramParam) != AutotuningOutputFilter.end()) ++ NewCRs.insert(CodeRegion(ModuleID, "none", "none", ProgramParam)); ++ ++ CRs = NewCRs; ++} ++ ++bool AutoTuningEngine::applyFunctionFilter(std::string FuncName) { ++ if (AutotuningFunctionFilter.getNumOccurrences() == 0) ++ return true; ++ ++ for (std::string FunctionFilter : AutotuningFunctionFilter) ++ if (FuncName == FunctionFilter) ++ return true; ++ ++ return false; ++} ++ ++void AutoTuningEngine::initContainer(Container *Container, ++ const std::string &PassName, ++ const StringRef FuncName, ++ bool AddOpportunity, ++ unsigned int Invocation) { ++ if (Enabled) { ++ if (!isTuningAllowedForType(convertPassToType(PassName)) && ++ !(isGenerateOutput() && ++ AutotuningOutputFilter.getNumOccurrences() == 0)) ++ return; ++ ++ if (!applyFunctionFilter(FuncName.str())) ++ return; ++ ++ // The attributes of a Container could potentially change overtime even with ++ // the same pass if the associated pass is invoked multiple times at ++ // different places in the pipeline. Therefore, we need to initCodeRegion ++ // every time when this function is called to ensure the CodeRegion with the ++ // latest information will be added as tuning opportunities. 
++ Container->initCodeRegion(); ++ if (Container->CR.getType() == autotuning::CodeRegionType::Invalid) ++ return; ++ ++ uint64_t hash = Container->computeStructuralHash(); ++ CodeRegion &OppCR = Container->CR; ++ if (GenerateOutput) { ++ if (OppCR.getSize() < SizeThreshold) ++ return; ++ if (ExcludeColdCodeRegion && OppCR.isCold()) { ++ LLVM_DEBUG(dbgs() << "Skip CodeRegion with cold function " ++ << OppCR.getFuncName() << "\n"); ++ return; ++ } ++ if (HotCodeRegionOnly && !OppCR.isHot()) { ++ LLVM_DEBUG(dbgs() << "Skip CodeRegion with " ++ << OppCR.getHotnessAsString() << " function " ++ << OppCR.getFuncName() << "\n"); ++ return; ++ } ++ } ++ OppCR.setPassName(PassName); ++ OppCR.setHash(hash); ++ OppCR.setInvocation(Invocation); ++ OppCR.Initialized = true; ++ if (AddOpportunity) ++ addOpportunity(OppCR); ++ } ++} ++ ++bool AutoTuningEngine::shouldRunOptPass(std::string Filename, ++ std::string Pass) { ++ return OppPassList.count(Filename) ? OppPassList[Filename].count(Pass) ++ : false; ++} ++ ++Error AutoTuningEngine::init(const std::string &Module) { ++ ParseInput = false; ++ if (std::optional MaybePath = ++ llvm::sys::Process::GetEnv("AUTOTUNE_INPUT")) { ++ InputFile = *MaybePath; ++ ParseInput = true; ++ } else if (InputFile.getNumOccurrences() > 0) { ++ ParseInput = true; ++ } ++ ++ GenerateOutput = false; ++ if (OutputOppDir.getNumOccurrences() > 0) ++ GenerateOutput = true; ++ ++ // Invocation of any of the following command line options ++ // (auto-tuning-input and auto-tuning-opp) or env variable ++ // AUTOTUNE_ALL_INPUT can enable auto-tuning mode. ++ if (ParseInput || GenerateOutput) { ++ Enabled = true; ++ // Generate absolute path and remove the base directory (if available). ++ // A relative path will be used as (coarse-grain) code region name. 
++ llvm::SmallString<128> ModuleVec = StringRef(Module); ++ llvm::sys::fs::make_absolute(ModuleVec); ++ if (ProjectDir.size() && ModuleVec.startswith(ProjectDir)) ++ ModuleID = ModuleVec.substr(ProjectDir.size()).str(); ++ else ++ ModuleID = std::string(ModuleVec); ++ } ++ ++ // Initialization of map to be used for pass-name to CodeRegionType ++ // conversion. ++ PTTMap = {{"loop-unroll", Loop}, ++ {"loop-vectorize", Loop}, ++ {"inline", CallSite}, ++ {"machine-scheduler", MachineBasicBlock}, ++ {"switch-lowering", Switch}, ++ {"autotuning-dump", Function}}; ++ ++ if (ParseInput) { ++ // Currently we only support yaml format for input. ++ if (Error E = AutoTuningRemarkManager::read(*this, InputFile, "yaml")) { ++ errs() << "Error parsing auto-tuning input.\n"; ++ return E; ++ } else { ++ LLVM_DEBUG(dbgs() << "AutoTuningEngine is initialized.\n" ++ << " Size of ParamTable: " << this->ParamTable.size() ++ << "\n"); ++ if (LLVMParams.size()) ++ LLVM_DEBUG(dbgs() << "AutoTuner: LLVMParams applied."); ++ if (ProgramParams.size()) ++ LLVM_DEBUG(dbgs() << "AutoTuner: ProgramParams applied.\n"); ++ } ++ } ++ ++ for (auto CRType : AutotuningOutputFilter) ++ CodeRegionFilterTypes.insert(CRType); ++ ++ if (GenerateOutput) { ++ switch (AutoTuningCompileMode) { ++ case CoarseGrain: { ++ bool Valid = false; ++ if (AutotuningOutputFilter.getNumOccurrences() > 0) { ++ Valid = true; ++ for (auto CRType : AutotuningOutputFilter) ++ if (CRType != LLVMParam) { ++ Valid = false; ++ break; ++ } ++ } ++ if (!Valid) { ++ AutoTuningCompileMode = Inactive; ++ errs() << "AutoTunerCompile: Code region type filtering does not match" ++ " with incremental compilation option.\n" ++ "Disabling incremental compilation.\n"; ++ } ++ break; ++ } ++ case FineGrain: { ++ bool Valid = false; ++ if (AutotuningOutputFilter.getNumOccurrences() > 0) { ++ Valid = true; ++ for (auto CRType : AutotuningOutputFilter) { ++ if (CRType != Loop && CRType != CallSite && CRType != Function) { ++ Valid = false; ++ 
break; ++ } ++ } ++ } ++ if (!Valid) { ++ AutoTuningCompileMode = Inactive; ++ errs() << "AutoTunerCompile: Code region type filtering does not match" ++ "with incremental compilation option.\n" ++ "Disabling incremental compilation.\n"; ++ } ++ break; ++ } ++ case Basic: ++ case Inactive: ++ break; ++ default: ++ llvm_unreachable("AutoTuningCompile: Unknown AutoTuner Incremental " ++ "Compilation mode.\n"); ++ } ++ } ++ ++ MLEnabled = (CFGNumber.getNumOccurrences() > 0); ++ if (EnableAutoTuningDump || MLEnabled) ++ DumpEnabled = true; ++ return Error::success(); ++} ++ ++llvm::Expected AutoTuningEngine::getConfigNumber() { ++ if (!isMLEnabled()) { ++ std::string errorMsg = ++ "No Autotuner configuration specified; ML guidance is unavailable."; ++ return createStringError(inconvertibleErrorCode(), errorMsg); ++ } else ++ return CFGNumber; ++} ++ ++Error AutoTuningEngine::finalize() { ++ if (OutputOppDir.getNumOccurrences() > 0) { ++ // Apply filters. ++ applyOppFilters(TuningOpps); ++ if (!TuningOpps.empty()) { ++ if (Error E = AutoTuningRemarkManager::dump( ++ *this, OutputOppDir, OutputFormat, RemarksPasses)) { ++ errs() << "Error generating auto-tuning opportunities.\n"; ++ return E; ++ } ++ } ++ ++ // Clear these two global lists when ending the auto-tuning ++ // in case of redundant information ++ TuningOpps.clear(); ++ } ++ return Error::success(); ++} ++ ++template ++bool AutoTuningEngine::lookUpGlobalParams(const std::string &ParamsName, ++ T &Value) const { ++ bool Found = GlobalParams.findByName(ParamsName, Value); ++ if (Found) { ++ LLVM_DEBUG(dbgs() << "Global Variable " << ParamsName << " is set.\n"); ++ } ++ return Found; ++} ++ ++template bool ++AutoTuningEngine::lookUpGlobalParams(const std::string &ParamsName, ++ int &Value) const; ++template bool ++AutoTuningEngine::lookUpGlobalParams(const std::string &ParamsName, ++ bool &Value) const; ++template bool ++AutoTuningEngine::lookUpGlobalParams(const std::string &ParamsName, ++ std::string &Value) 
const; ++template bool AutoTuningEngine::lookUpGlobalParams>( ++ const std::string &ParamsName, std::vector &Value) const; ++ ++class AutoTuningEngine Engine; ++ ++} // namespace autotuning ++ ++#endif +diff --git a/llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp b/llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp +new file mode 100644 +index 000000000000..3e0506e534c4 +--- /dev/null ++++ b/llvm/lib/AutoTuner/AutoTuningRemarkManager.cpp +@@ -0,0 +1,299 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===- llvm/AutoTuner/AutoTuningRemarkManager.cpp - Remark Manager --------===// ++// ++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. ++// See https://llvm.org/LICENSE.txt for license information. ++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception ++// ++//===----------------------------------------------------------------------===// ++// ++// This file contains the implementation of for inputting and outputting remarks ++// for AutoTuning. ++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/AutoTuner/AutoTuningRemarkManager.h" ++#include "llvm/ADT/StringRef.h" ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/AutoTuner/AutoTuningRemarkStreamer.h" ++#include "llvm/IR/DebugInfoMetadata.h" ++#include "llvm/IR/LLVMRemarkStreamer.h" ++#include "llvm/Remarks/Remark.h" ++#include "llvm/Remarks/RemarkFormat.h" ++#include "llvm/Remarks/RemarkParser.h" ++#include "llvm/Remarks/RemarkSerializer.h" ++#include "llvm/Remarks/RemarkStreamer.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Support/Debug.h" ++#include "llvm/Support/FileSystem.h" ++#include "llvm/Support/MemoryBuffer.h" ++#include "llvm/Support/Path.h" ++#include "llvm/Support/ToolOutputFile.h" ++ ++// Enable debug messages for AutoTuner. ++#define DEBUG_TYPE "autotuning" ++ ++using namespace llvm; ++using namespace autotuning; ++ ++// Helper functions. ++namespace { ++// Convert string into CodeRegionType. 
++Expected StringToCodeRegionType(const std::string &CRType) { ++ if (CRType == "machine_basic_block") ++ return autotuning::CodeRegionType::MachineBasicBlock; ++ else if (CRType == "loop") ++ return autotuning::CodeRegionType::Loop; ++ else if (CRType == "function") ++ return autotuning::CodeRegionType::Function; ++ else if (CRType == "callsite") ++ return autotuning::CodeRegionType::CallSite; ++ else if (CRType == "llvm-param") ++ return autotuning::CodeRegionType::LLVMParam; ++ else if (CRType == "program-param") ++ return autotuning::CodeRegionType::ProgramParam; ++ else if (CRType == "switch") ++ return autotuning::CodeRegionType::Switch; ++ else if (CRType == "other") ++ return autotuning::CodeRegionType::Other; ++ else ++ return make_error("Unsupported CodeRegionType:" + CRType, ++ inconvertibleErrorCode()); ++} ++ ++// Remark -> autotuning::ParameterManager ++ParameterManager RemarkToParameterManager(const remarks::Remark &Remark) { ++ // Create Parameters from a remark. ++ ParameterManager ParamManager; ++ for (const remarks::Argument &Arg : Remark.Args) { ++ int Value = 0; ++ if (!Arg.Val.getAsInteger(10, Value)) ++ // If no errors ++ ParamManager.add(Arg.Key.str(), Value); ++ else if (Arg.Val == "true") ++ ParamManager.add(Arg.Key.str(), true); ++ else if (Arg.Val == "false") ++ ParamManager.add(Arg.Key.str(), false); ++ // If there is a value of vector type ++ else if (Arg.VectorVal) { ++ std::vector Strings; ++ for (const StringRef &Val : *Arg.VectorVal) { ++ Strings.push_back(Val.str()); ++ } ++ ParamManager.add(Arg.Key.str(), Strings); ++ } else ++ // Add as String Value ++ ParamManager.add(Arg.Key.str(), Arg.Val); ++ } ++ ++ return ParamManager; ++} ++ ++// Remark -> std::unordered_map ++std::unordered_map ++RemarkToStringMap(const remarks::Remark &Remark) { ++ std::unordered_map LLVMParams; ++ for (const remarks::Argument &Arg : Remark.Args) { ++ // Add as String Value ++ LLVMParams[Arg.Key.str()] = Arg.Val.str(); ++ } ++ return LLVMParams; ++} ++ 
++// Remark -> autotuning::SourceLocation ++SourceLocation RemarkToSourceLocation(const remarks::Remark &Remark) { ++ SourceLocation Location; ++ if (Remark.Loc) { ++ StringRef File = Remark.Loc->SourceFilePath; ++ unsigned Line = Remark.Loc->SourceLine; ++ unsigned Column = Remark.Loc->SourceColumn; ++ Location = {File.str(), Line, Column}; ++ } ++ return Location; ++} ++ ++// Remark -> autotuning::CodeRegion ++CodeRegion RemarkToCodeRegion(const remarks::Remark &Remark, ++ Expected &Type) { ++ // Create a SourceLocation from a remark. ++ SourceLocation Location = RemarkToSourceLocation(Remark); ++ // Create a CodeRegion from a remark. ++ CodeRegion CR = CodeRegion(Remark.RemarkName.str(), Remark.FunctionName.str(), ++ Remark.PassName.str(), Type.get(), Location); ++ if (Remark.CodeRegionHash) ++ CR.setHash(Remark.CodeRegionHash.value_or(0)); ++ if (Remark.Invocation) ++ CR.setInvocation(Remark.Invocation.value_or(0)); ++ ++ return CR; ++} ++ ++Expected> emitAutoTuningRemarks( ++ const StringRef RemarksFilename, const StringRef RemarksFormat, ++ const StringRef RemarksPasses, const CodeRegions &CRList) { ++ if (RemarksFilename.empty()) ++ return nullptr; ++ // Parse remark format. Options are yaml, yaml-strtab and bitstream. ++ Expected Format = remarks::parseFormat(RemarksFormat); ++ if (Error E = Format.takeError()) ++ return make_error(std::move(E)); ++ ++ std::error_code EC; ++ auto Flags = ++ *Format == remarks::Format::YAML ? sys::fs::OF_Text : sys::fs::OF_None; ++ auto RemarksFile = ++ std::make_unique(RemarksFilename, EC, Flags); ++ if (EC) ++ return make_error(errorCodeToError(EC)); ++ // Create a remark serializer to emit code regions. ++ Expected> RemarkSerializer = ++ remarks::createRemarkSerializer( ++ *Format, remarks::SerializerMode::Separate, RemarksFile->os()); ++ ++ if (Error E = RemarkSerializer.takeError()) ++ return make_error(std::move(E)); ++ // Create remark streamer based on the serializer. 
++ remarks::RemarkStreamer RStreamer = ++ remarks::RemarkStreamer(std::move(*RemarkSerializer), RemarksFilename); ++ AutoTuningRemarkStreamer Streamer(RStreamer); ++ ++ if (!RemarksPasses.empty()) ++ if (Error E = Streamer.setFilter(RemarksPasses)) ++ return make_error(std::move(E)); ++ // Emit CodeRegions in Remark format. ++ for (const CodeRegion &CR : CRList) { ++ Streamer.emit(CR); ++ } ++ return std::move(RemarksFile); ++} ++} // namespace ++ ++llvm::Error AutoTuningRemarkManager::read(AutoTuningEngine &E, ++ const std::string &InputFileName, ++ const std::string &RemarksFormat) { ++ ErrorOr> Buf = ++ MemoryBuffer::getFile(InputFileName.c_str()); ++ if (std::error_code EC = Buf.getError()) ++ return make_error( ++ "Can't open file " + InputFileName + ": " + EC.message(), EC); ++ // Parse remark format. Options are yaml, yaml-strtab and bitstream. ++ Expected Format = remarks::parseFormat(RemarksFormat); ++ if (!Format) ++ return Format.takeError(); ++ ++ Expected> MaybeParser = ++ remarks::createRemarkParserFromMeta(*Format, (*Buf)->getBuffer()); ++ if (!MaybeParser) { ++ return MaybeParser.takeError(); ++ } ++ remarks::RemarkParser &Parser = **MaybeParser; ++ ++ while (true) { ++ Expected> MaybeRemark = Parser.next(); ++ if (!MaybeRemark) { ++ Error E = MaybeRemark.takeError(); ++ if (E.isA()) { ++ // EOF. ++ consumeError(std::move(E)); ++ break; ++ } ++ return E; ++ } ++ const remarks::Remark &Remark = **MaybeRemark; ++ ++ if (Remark.RemarkType != remarks::Type::AutoTuning) ++ continue; ++ ++ if (!Remark.CodeRegionType) ++ return make_error("CodeRegionType field is missing.", ++ inconvertibleErrorCode()); ++ Expected Type = ++ StringToCodeRegionType((*Remark.CodeRegionType).str()); ++ if (!Type) ++ return Type.takeError(); ++ CodeRegionType CRType = Type.get(); ++ // If CodeRegionType is Other, this remark corresponds to global ++ // parameters, and no need to create a CodeRegion object. 
Check if the ++ // Remark of global parameters is for the current Module. ++ if (CRType == autotuning::Other && Remark.RemarkName == Engine.ModuleID) { ++ Engine.GlobalParams = RemarkToParameterManager(Remark); ++ continue; ++ } ++ if (CRType == autotuning::LLVMParam && ++ Remark.RemarkName == Engine.ModuleID) { ++ Engine.LLVMParams = RemarkToStringMap(Remark); ++ continue; ++ } ++ if (CRType == autotuning::ProgramParam && ++ Remark.RemarkName == Engine.ModuleID) { ++ Engine.ProgramParams = RemarkToStringMap(Remark); ++ continue; ++ } ++ if (Engine.isThinLTOTuning() && ++ (CRType == autotuning::CallSite || CRType == autotuning::Loop || ++ CRType == autotuning::MachineBasicBlock || ++ CRType == autotuning::Function)) { ++ LLVM_DEBUG(dbgs() << "AutoTuner does not support tuning of " ++ << CodeRegion::getTypeAsString(CRType) ++ << " for thinLTO durning link-time optimization. " ++ "Ignoring current code region.\n"); ++ continue; ++ } ++ // Create a SourceLocation from a remark. ++ CodeRegion CR = RemarkToCodeRegion(Remark, Type); ++ ParameterManager ParamManager = RemarkToParameterManager(Remark); ++ // Add the CodeRegion-ParameterManager entry into LoopUpTable. ++ Engine.ParamTable[CR] = ParamManager; ++ ++ std::string Filename = CR.getSourceLoc().SourceFilePath; ++ size_t Pos = Filename.rfind("."); ++ if (Pos != std::string::npos) ++ Filename.erase(Pos, Filename.size()); ++ Engine.OppPassList[Filename].insert(CR.getPassName()); ++ Engine.CodeRegionFilterTypes.insert(CR.getType()); ++ } ++ return Error::success(); ++} ++ ++Error AutoTuningRemarkManager::dump(const autotuning::AutoTuningEngine &E, ++ const std::string &DirName, ++ const std::string &RemarksFormat, ++ const std::string &RemarksPasses) { ++ // Change to absolute path. ++ SmallString<256> OutputPath = StringRef(DirName); ++ sys::fs::make_absolute(OutputPath); ++ ++ // Make sure the new output directory exists, creating it if necessary. 
++ if (std::error_code EC = sys::fs::create_directories(OutputPath)) { ++ return make_error("could not create directory: " + ++ Twine(OutputPath) + ": " + EC.message(), ++ EC); ++ } ++ if (!Engine.TuningOpps.empty()) { ++ StringRef ModelFileName = sys::path::filename(Engine.ModuleID); ++ sys::path::append(OutputPath, ModelFileName + "." + RemarksFormat); ++ ++ int i = 1; // Output file suffix starts from 1. ++ // Check all exiting xml files xml.1...i and create a new file ++ // suffix.(i+1). ++ while (sys::fs::exists(OutputPath)) { ++ sys::path::remove_filename(OutputPath); ++ sys::path::append(OutputPath, ++ ModelFileName + "." + RemarksFormat + "." + Twine(i)); ++ i += 1; ++ } ++ Expected> RemarksFileOrErr = ++ emitAutoTuningRemarks(OutputPath, RemarksFormat, RemarksPasses, ++ Engine.TuningOpps); ++ if (Error E = RemarksFileOrErr.takeError()) { ++ return E; ++ } ++ ++ std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); ++ if (RemarksFile) ++ RemarksFile->keep(); ++ } ++ return Error::success(); ++} ++ ++#endif +diff --git a/llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp b/llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp +new file mode 100644 +index 000000000000..0516c055a139 +--- /dev/null ++++ b/llvm/lib/AutoTuner/AutoTuningRemarkStreamer.cpp +@@ -0,0 +1,55 @@ ++#if defined(ENABLE_AUTOTUNER) ++// ===---------- llvm/AutoTuner/AutoTuningRemarkStreamer.cpp --------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved. ++// ++// ===---------------------------------------------------------------------===// ++// ++// This file contains the implementation of the conversion between AutoTuner ++// CodeRegions and serializable remarks::Remark objects. 
++// ++// ===---------------------------------------------------------------------===// ++ ++#include "llvm/AutoTuner/AutoTuningRemarkStreamer.h" ++ ++using namespace llvm; ++ ++// autotuning::CodeRegion -> Remark ++remarks::Remark ++AutoTuningRemarkStreamer::toRemark(const autotuning::CodeRegion &CR) { ++ remarks::Remark R; // The result. ++ R.RemarkType = remarks::Type::AutoTuning; ++ R.PassName = CR.getPassName(); ++ R.RemarkName = CR.getName(); ++ R.FunctionName = CR.getFuncName(); ++ const autotuning::SourceLocation &Location = CR.getSourceLoc(); ++ if (Location) ++ R.Loc = remarks::RemarkLocation{Location.SourceFilePath, ++ Location.SourceLine, Location.SourceColumn}; ++ R.CodeRegionType = CR.getTypeAsString(); ++ R.CodeRegionHash = CR.getHash(); ++ R.AutoTunerOptions = CR.getAutoTunerOptions(); ++ R.Invocation = CR.getInvocation(); ++ R.BaselineConfig = CR.getBaselineConfig(); ++ return R; ++} ++ ++void AutoTuningRemarkStreamer::emit(const autotuning::CodeRegion &CR) { ++ if (!RS.matchesFilter(CR.getPassName())) ++ return; ++ ++ // First, convert the code region to a remark. ++ remarks::Remark R = toRemark(CR); ++ // Then, emit the remark through the serializer. 
++ RS.getSerializer().emit(R); ++} ++ ++Error AutoTuningRemarkStreamer::setFilter(StringRef Filter) { ++ return RS.setFilter(Filter); ++} ++#endif +diff --git a/llvm/lib/AutoTuner/CMakeLists.txt b/llvm/lib/AutoTuner/CMakeLists.txt +new file mode 100644 +index 000000000000..c618474fe5ae +--- /dev/null ++++ b/llvm/lib/AutoTuner/CMakeLists.txt +@@ -0,0 +1,11 @@ ++add_llvm_component_library(LLVMAutoTuner ++ AutoTuning.cpp ++ AutoTuningRemarkManager.cpp ++ AutoTuningRemarkStreamer.cpp ++ ++ ADDITIONAL_HEADER_DIRS ++ ${LLVM_MAIN_INCLUDE_DIR}/llvm/AutoTuner ++ ++ LINK_COMPONENTS ++ Remarks ++) +\ No newline at end of file +diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt +index 283baa6090eb..966137c0f71f 100644 +--- a/llvm/lib/CMakeLists.txt ++++ b/llvm/lib/CMakeLists.txt +@@ -28,6 +28,7 @@ add_subdirectory(Object) + add_subdirectory(ObjectYAML) + add_subdirectory(Option) + add_subdirectory(Remarks) ++add_subdirectory(AutoTuner) + add_subdirectory(Debuginfod) + add_subdirectory(DebugInfo) + add_subdirectory(DWP) +diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt +index 106571b9061b..9029dc7bb3d9 100644 +--- a/llvm/lib/CodeGen/CMakeLists.txt ++++ b/llvm/lib/CodeGen/CMakeLists.txt +@@ -273,6 +273,7 @@ add_llvm_component_library(LLVMCodeGen + + LINK_COMPONENTS + Analysis ++ AutoTuner + BitReader + BitWriter + CodeGenTypes +diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp +index 5a005ba7b414..9dcb3833ab91 100644 +--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp ++++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp +@@ -29,6 +29,24 @@ using namespace llvm; + + #define DEBUG_TYPE "calcspillweights" + ++#if defined(ENABLE_AUTOTUNER) ++static cl::opt LoopWeight( ++ "reg-spill-loop-weight", cl::Hidden, ++ cl::desc( ++ "Tunable extra weight to what looks like a loop induction variable"), ++ cl::init(3)); ++ ++static cl::opt RemaWeight( ++ "reg-spill-rematerialize-weight", cl::Hidden, ++ cl::desc("Tunable reduced 
weight giving re-materialize oppotunities"), ++ cl::init(0.5f)); ++ ++static cl::opt ++ HintWeight("reg-spill-hint-weight", cl::Hidden, ++ cl::desc("Tunable weakly boost weight of hinted registers"), ++ cl::init(1.01f)); ++#endif ++ + void VirtRegAuxInfo::calculateSpillWeightsAndHints() { + LLVM_DEBUG(dbgs() << "********** Compute Spill Weights **********\n" + << "********** Function: " << MF.getName() << '\n'); +@@ -252,7 +270,11 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, + + // Give extra weight to what looks like a loop induction variable update. + if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB)) ++#if defined(ENABLE_AUTOTUNER) ++ Weight *= LoopWeight; ++#else + Weight *= 3; ++#endif + + TotalWeight += Weight; + } +@@ -288,7 +310,11 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, + } + + // Weakly boost the spill weight of hinted registers. ++#if defined(ENABLE_AUTOTUNER) ++ TotalWeight *= HintWeight; ++#else + TotalWeight *= 1.01F; ++#endif + } + + // If the live interval was already unspillable, leave it that way. +@@ -315,7 +341,11 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, + // FIXME: this gets much more complicated once we support non-trivial + // re-materialization. 
+ if (isRematerializable(LI, LIS, VRM, *MF.getSubtarget().getInstrInfo())) ++#if defined(ENABLE_AUTOTUNER) ++ TotalWeight *= RemaWeight; ++#else + TotalWeight *= 0.5F; ++#endif + + if (IsLocalSplitArtifact) + return normalize(TotalWeight, Start->distance(*End), NumInstr); +diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp +index 231544494c32..327cd40f86a4 100644 +--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp ++++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp +@@ -37,6 +37,9 @@ + #include "llvm/Support/raw_ostream.h" + #include "llvm/Target/TargetMachine.h" + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/IR/StructuralHash.h" ++#endif + #include + using namespace llvm; + +@@ -1703,6 +1706,39 @@ MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const { + return LiveIns.begin(); + } + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t MachineBasicBlock::computeStructuralHash() { ++ return StructuralHash(*this); ++} ++ ++void MachineBasicBlock::initCodeRegion() { ++ std::string BasicBlockName = ++ ("%bb." + Twine(this->getNumber()) + ":" + this->getName()).str(); ++ MachineFunction *MF = this->getParent(); ++ StringRef FuncName = MF->getName(); ++ ++ autotuning::CodeRegion CR; ++ if (!this->empty()) { ++ const DebugLoc &StartLoc = this->front().getDebugLoc(); ++ CR = autotuning::CodeRegion(BasicBlockName, FuncName.data(), ++ autotuning::CodeRegionType::MachineBasicBlock, ++ StartLoc); ++ } else { ++ CR = autotuning::CodeRegion(BasicBlockName, FuncName.data(), ++ autotuning::CodeRegionType::MachineBasicBlock); ++ } ++ // Compute the number of non-debug IR instructions in this MBB. ++ unsigned NumInstrs = std::distance(this->getFirstNonDebugInstr(), ++ this->getLastNonDebugInstr()); ++ CR.setSize(NumInstrs); ++ // Compute hotness. 
++ autotuning::HotnessType Hotness = MF->getFunction().ATEFunction.getHotness(); ++ CR.setHotness(Hotness); ++ ++ this->setCodeRegion(CR); ++} ++#endif ++ + MachineBasicBlock::liveout_iterator MachineBasicBlock::liveout_begin() const { + const MachineFunction &MF = *getParent(); + assert(MF.getProperties().hasProperty( +diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp +index ba5432459d12..caccc9e5fad4 100644 +--- a/llvm/lib/CodeGen/MachineScheduler.cpp ++++ b/llvm/lib/CodeGen/MachineScheduler.cpp +@@ -569,6 +569,12 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, + for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end(); + MBB != MBBEnd; ++MBB) { + ++#if defined(ENABLE_AUTOTUNER) ++ // before visiting this MBB ++ // if AutoTuning is enabled, initialize this MBB for auto-tuning ++ autotuning::Engine.initContainer(&*MBB, DEBUG_TYPE); ++#endif ++ + Scheduler.startBlock(&*MBB); + + #ifndef NDEBUG +@@ -3244,6 +3250,44 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, + RegionPolicy.ShouldTrackLaneMasks = false; + } + ++#if defined(ENABLE_AUTOTUNER) ++ // AUTO-TUNING - Look up for MMB level scheduling direction if AutoTuning is ++ // enabled ++ if (autotuning::Engine.isEnabled()) { ++ MachineBasicBlock &MBB = *Begin->getParent(); ++ ++ bool NewForceBottomUp = false; ++ // Look up from xml file, and overwrite values ++ bool IsForceBottomUpSet = ++ MBB.lookUpParams("ForceBottomUp", NewForceBottomUp); ++ ++ bool NewForceForceTopDown = false; ++ bool IsForceTopDownSet = ++ MBB.lookUpParams("ForceTopDown", NewForceForceTopDown); ++ ++ assert((!NewForceBottomUp || !NewForceForceTopDown) && ++ "BottomUp and TopDown cannot both set to true"); ++ ++ if (IsForceBottomUpSet) { ++ RegionPolicy.OnlyBottomUp = NewForceBottomUp; ++ if (RegionPolicy.OnlyBottomUp) { ++ RegionPolicy.OnlyTopDown = false; ++ } ++ } ++ ++ if (IsForceTopDownSet) { ++ RegionPolicy.OnlyTopDown = 
NewForceForceTopDown; ++ if (RegionPolicy.OnlyTopDown) { ++ RegionPolicy.OnlyBottomUp = false; ++ } ++ } ++ ++ if (IsForceBottomUpSet || IsForceTopDownSet) { ++ return; ++ } ++ } ++#endif ++ + // Check -misched-topdown/bottomup can force or unforce scheduling direction. + // e.g. -misched-bottomup=false allows scheduling in both directions. + assert((!ForceTopDown || !ForceBottomUp) && +diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +index 36a02d5beb4b..d4ac95d534ed 100644 +--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp ++++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp +@@ -16,6 +16,9 @@ + #include "llvm/CodeGen/MachineJumpTableInfo.h" + #include "llvm/CodeGen/TargetLowering.h" + #include "llvm/Target/TargetMachine.h" ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + using namespace llvm; + using namespace SwitchCG; +@@ -61,7 +64,23 @@ void SwitchCG::SwitchLowering::findJumpTables(CaseClusterVector &Clusters, + if (!TLI->areJTsAllowed(SI->getParent()->getParent())) + return; + ++#if defined(ENABLE_AUTOTUNER) ++ unsigned MinJumpTableEntries = TLI->getMinimumJumpTableEntries(); ++ // Overwrite MinJumpTableEntries when it is set by Autotuner ++ if (autotuning::Engine.isEnabled()) { ++ autotuning::Engine.initContainer(SI->ATESwitchInst.get(), ++ "switch-lowering"); ++ ++ int NewValue = 0; // the int value is set by lookUpParams() ++ bool Changed = ++ SI->ATESwitchInst->lookUpParams("MinJumpTableEntries", NewValue); ++ if (Changed) ++ MinJumpTableEntries = NewValue; ++ } ++#else + const unsigned MinJumpTableEntries = TLI->getMinimumJumpTableEntries(); ++#endif ++ + const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2; + + // Bail if not enough cases. 
+diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp +index df753b91ff90..af77e6c2dc4d 100644 +--- a/llvm/lib/IR/AsmWriter.cpp ++++ b/llvm/lib/IR/AsmWriter.cpp +@@ -2602,11 +2602,21 @@ public: + void writeAllAttributeGroups(); + + void printTypeIdentities(); ++#if defined(ENABLE_AUTOTUNER) ++ void printGlobal(const GlobalVariable *GV, bool PrintDeclarationOnly = false); ++ void printAlias(const GlobalAlias *GA); ++ void printIFunc(const GlobalIFunc *GI); ++ void printComdat(const Comdat *C); ++ void printRequisiteDeclarations(const Function *F); ++ void printFunction(const Function *F, bool PrintCompleteIR = false, ++ bool PrintDeclarationOnly = false); ++#else + void printGlobal(const GlobalVariable *GV); + void printAlias(const GlobalAlias *GA); + void printIFunc(const GlobalIFunc *GI); + void printComdat(const Comdat *C); + void printFunction(const Function *F); ++#endif + void printArgument(const Argument *FA, AttributeSet Attrs); + void printBasicBlock(const BasicBlock *BB); + void printInstructionLine(const Instruction &I); +@@ -3593,15 +3603,26 @@ static void maybePrintComdat(formatted_raw_ostream &Out, + Out << ')'; + } + ++#if defined(ENABLE_AUTOTUNER) ++void AssemblyWriter::printGlobal(const GlobalVariable *GV, ++ bool PrintDeclarationOnly) { ++ if (GV->isMaterializable() && !PrintDeclarationOnly) ++#else + void AssemblyWriter::printGlobal(const GlobalVariable *GV) { + if (GV->isMaterializable()) ++#endif + Out << "; Materializable\n"; + + AsmWriterContext WriterCtx(&TypePrinter, &Machine, GV->getParent()); + WriteAsOperandInternal(Out, GV, WriterCtx); + Out << " = "; + ++#if defined(ENABLE_AUTOTUNER) ++ if ((!GV->hasInitializer() || PrintDeclarationOnly) && ++ GV->hasExternalLinkage()) ++#else + if (!GV->hasInitializer() && GV->hasExternalLinkage()) ++#endif + Out << "external "; + + Out << getLinkageNameWithSpace(GV->getLinkage()); +@@ -3619,7 +3640,11 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { + Out << 
(GV->isConstant() ? "constant " : "global "); + TypePrinter.print(GV->getValueType(), Out); + ++#if defined(ENABLE_AUTOTUNER) ++ if (GV->hasInitializer() && !PrintDeclarationOnly) { ++#else + if (GV->hasInitializer()) { ++#endif + Out << ' '; + writeOperand(GV->getInitializer(), false); + } +@@ -3769,12 +3794,102 @@ void AssemblyWriter::printTypeIdentities() { + } + } + ++#if defined(ENABLE_AUTOTUNER) ++/// printRequisiteDeclarations - Print the declarations of type identities, ++/// global variables, functions, and function attribute groups of a function. ++void AssemblyWriter::printRequisiteDeclarations(const Function *F) { ++ // walk through instructions and collect global variables & functions ++ SmallPtrSet GVs; ++ SmallPtrSet Functions; ++ for (const BasicBlock &BB : *F) { ++ for (const Instruction &I : BB) { ++ // Check for function ++ if (const auto *CI = dyn_cast(&I)) { ++ Function *func = CI->getCalledFunction(); ++ if (func) ++ Functions.insert(func); ++ } ++ // Check for global variables ++ for (const Use &U : I.operands()) { ++ if (GlobalVariable *gv = dyn_cast(U)) ++ GVs.insert(gv); ++ if (GEPOperator *gepo = dyn_cast(&U)) { ++ if (GlobalVariable *gv = ++ dyn_cast(gepo->getPointerOperand())) ++ GVs.insert(gv); ++ for (auto it = gepo->idx_begin(), et = gepo->idx_end(); it != et; ++ ++it) { ++ if (GlobalVariable *gv = dyn_cast(*it)) ++ GVs.insert(gv); ++ } ++ } ++ } ++ } ++ } ++ ++ // print type identities ++ printTypeIdentities(); ++ ++ // print global variables ++ if (!GVs.empty()) { ++ Out << '\n'; ++ for (auto GVit = GVs.begin(), et = GVs.end(); GVit != et; ++GVit) { ++ // Make backups of some properties. They may be modified for printing. 
++ GlobalValue::LinkageTypes SavedLinkage = (*GVit)->getLinkage(); ++ GlobalVariable::VisibilityTypes SavedVisibility = ++ (*GVit)->getVisibility(); ++ ++ // modify property if needed ++ if (!(*GVit)->hasAvailableExternallyLinkage() && ++ !((*GVit)->getName() == "llvm.global_ctors") && ++ (*GVit)->hasLocalLinkage()) { ++ (*GVit)->setLinkage(GlobalValue::ExternalLinkage); ++ (*GVit)->setVisibility(GlobalValue::HiddenVisibility); ++ } ++ ++ printGlobal(*GVit, true); ++ Out << '\n'; ++ ++ // restore backups ++ (*GVit)->setLinkage(SavedLinkage); ++ (*GVit)->setVisibility(SavedVisibility); ++ } ++ Out << '\n'; ++ } ++ ++ // print functions ++ for (auto FuncIt = Functions.begin(), et = Functions.end(); FuncIt != et; ++ ++FuncIt) { ++ Out << '\n'; ++ printFunction(*FuncIt, false, true); ++ } ++ ++ // Write attribute groups. ++ if (!Machine.as_empty()) { ++ Out << '\n'; ++ writeAllAttributeGroups(); ++ } ++ Out << '\n'; ++} ++ + /// printFunction - Print all aspects of a function. ++void AssemblyWriter::printFunction(const Function *F, bool PrintCompleteIR, ++ bool PrintDeclarationOnly) { ++ if (PrintCompleteIR && !PrintDeclarationOnly) { ++ printRequisiteDeclarations(F); ++ } ++ if (AnnotationWriter && !PrintDeclarationOnly) ++ AnnotationWriter->emitFunctionAnnot(F, Out); ++ ++ if (F->isMaterializable() && !PrintDeclarationOnly) ++ Out << "; Materializable\n"; ++#else + void AssemblyWriter::printFunction(const Function *F) { + if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); + + if (F->isMaterializable()) + Out << "; Materializable\n"; ++#endif + + const AttributeList &Attrs = F->getAttributes(); + if (Attrs.hasFnAttrs()) { +@@ -3792,6 +3907,18 @@ void AssemblyWriter::printFunction(const Function *F) { + Out << "; Function Attrs: " << AttrStr << '\n'; + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (!PrintDeclarationOnly) ++ Machine.incorporateFunction(F); ++ ++ if (F->isDeclaration() || PrintDeclarationOnly) { ++ Out << "declare"; ++ if 
(!PrintDeclarationOnly) { ++ SmallVector, 4> MDs; ++ F->getAllMetadata(MDs); ++ printMetadataAttachments(MDs, " "); ++ } ++#else + Machine.incorporateFunction(F); + + if (F->isDeclaration()) { +@@ -3799,6 +3926,7 @@ void AssemblyWriter::printFunction(const Function *F) { + SmallVector, 4> MDs; + F->getAllMetadata(MDs); + printMetadataAttachments(MDs, " "); ++#endif + Out << ' '; + } else + Out << "define "; +@@ -3824,7 +3952,11 @@ void AssemblyWriter::printFunction(const Function *F) { + Out << '('; + + // Loop over the arguments, printing them... ++#if defined(ENABLE_AUTOTUNER) ++ if ((F->isDeclaration() && !IsForDebug) || PrintDeclarationOnly) { ++#else + if (F->isDeclaration() && !IsForDebug) { ++#endif + // We're only interested in the type here - don't print argument names. + for (unsigned I = 0, E = FT->getNumParams(); I != E; ++I) { + // Insert commas as we go... the first arg doesn't get a comma +@@ -3895,7 +4027,11 @@ void AssemblyWriter::printFunction(const Function *F) { + writeOperand(F->getPersonalityFn(), /*PrintType=*/true); + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (F->isDeclaration() || PrintDeclarationOnly) { ++#else + if (F->isDeclaration()) { ++#endif + Out << '\n'; + } else { + SmallVector, 4> MDs; +@@ -3913,6 +4049,13 @@ void AssemblyWriter::printFunction(const Function *F) { + Out << "}\n"; + } + ++#if defined(ENABLE_AUTOTUNER) ++ // Output metadata ++ if (!Machine.mdn_empty() && PrintCompleteIR && !PrintDeclarationOnly) { ++ Out << '\n'; ++ writeAllMDNodes(); ++ } ++#endif + Machine.purgeFunction(); + } + +@@ -4591,13 +4734,21 @@ void AssemblyWriter::printUseLists(const Function *F) { + + void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, + bool ShouldPreserveUseListOrder, ++#if defined(ENABLE_AUTOTUNER) ++ bool IsForDebug, bool PrintCompleteIR) const { ++#else + bool IsForDebug) const { ++#endif + SlotTracker SlotTable(this->getParent()); + formatted_raw_ostream OS(ROS); + AssemblyWriter W(OS, SlotTable, 
this->getParent(), AAW, + IsForDebug, + ShouldPreserveUseListOrder); ++#if defined(ENABLE_AUTOTUNER) ++ W.printFunction(this, PrintCompleteIR); ++#else + W.printFunction(this); ++#endif + } + + void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, +diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt +index 217fe703dd4e..d44d1eea9f3e 100644 +--- a/llvm/lib/IR/CMakeLists.txt ++++ b/llvm/lib/IR/CMakeLists.txt +@@ -78,6 +78,7 @@ add_llvm_component_library(LLVMCore + intrinsics_gen + + LINK_COMPONENTS ++ AutoTuner + BinaryFormat + Demangle + Remarks +diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp +index 435800d9e5f9..ec2620efac38 100644 +--- a/llvm/lib/IR/Function.cpp ++++ b/llvm/lib/IR/Function.cpp +@@ -70,6 +70,10 @@ + #include + #include + ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/IR/StructuralHash.h" ++#endif ++ + using namespace llvm; + using ProfileCount = Function::ProfileCount; + +@@ -1977,6 +1981,36 @@ std::optional Function::getSectionPrefix() const { + return std::nullopt; + } + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t AutoTuningEnabledFunction::computeStructuralHash() { ++ return StructuralHash(*(this->Func)); ++} ++ ++void AutoTuningEnabledFunction::initCodeRegion() { ++ StringRef FuncName = Func->getName(); ++ StringRef EntryBBName; ++ autotuning::SourceLocation Loc; ++ ++ if (!Func->empty()) ++ EntryBBName = Func->front().getName(); ++ else ++ EntryBBName = StringRef("None"); ++ ++ DISubprogram *SubProgram = Func->getSubprogram(); ++ if (SubProgram) ++ // Set the column number to 0 because there is no information about ++ // column number for functions. 
++ Loc = {SubProgram->getFilename().str(), SubProgram->getLine(), 0}; ++ ++ autotuning::CodeRegion CR = ++ autotuning::CodeRegion(EntryBBName.data(), FuncName.data(), ++ autotuning::CodeRegionType::Function, Loc); ++ CR.setSize(Func->getInstructionCount()); ++ CR.setHotness(this->getHotness()); ++ this->setCodeRegion(CR); ++} ++#endif ++ + bool Function::nullPointerIsDefined() const { + return hasFnAttribute(Attribute::NullPointerIsValid); + } +diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp +index cb0ac0f8eae6..e614285df07a 100644 +--- a/llvm/lib/IR/Instructions.cpp ++++ b/llvm/lib/IR/Instructions.cpp +@@ -45,6 +45,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/IR/StructuralHash.h" ++#endif + + using namespace llvm; + +@@ -259,6 +262,89 @@ void LandingPadInst::addClause(Constant *Val) { + getOperandList()[OpNo] = Val; + } + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t AutoTuningEnabledSwitchInst::computeStructuralHash() { ++ return StructuralHash(*(this->SI)); ++} ++ ++void AutoTuningEnabledSwitchInst::initCodeRegion() { ++ std::string SwitchName; ++ if (this->SI->hasName()) { ++ SwitchName = this->SI->getName().str(); ++ } else { ++ std::string Str; ++ llvm::raw_string_ostream RSO(Str); ++ this->SI->getCondition()->printAsOperand(RSO); ++ SwitchName = RSO.str(); ++ } ++ ++ autotuning::CodeRegion CR = autotuning::CodeRegion( ++ SwitchName, this->SI->getFunction()->getName().str(), ++ autotuning::CodeRegionType::Switch, this->SI->getDebugLoc()); ++ ++ unsigned TotalNumInsts = 0; ++ for (auto Case : SI->cases()) { ++ const BasicBlock *BB = Case.getCaseSuccessor(); ++ unsigned NumInsts = std::distance(BB->instructionsWithoutDebug().begin(), ++ BB->instructionsWithoutDebug().end()); ++ TotalNumInsts += NumInsts; ++ } ++ ++ CR.setSize(TotalNumInsts); ++ // Compute hotness. 
++ autotuning::HotnessType Hotness = ++ this->SI->getFunction()->ATEFunction.getHotness(); ++ CR.setHotness(Hotness); ++ ++ this->setCodeRegion(CR); ++} ++ ++uint64_t AutoTuningEnabledCallSite::computeStructuralHash() { ++ return StructuralHash(*(this->CB)); ++} ++ ++void AutoTuningEnabledCallSite::initCodeRegion() { ++ // Use Caller's name as FuncName and Callee's name as Name of a CodeRegion. ++ Function *Caller = this->CB->getCaller(); ++ Function *Callee = this->CB->getCalledFunction(); ++ if (Caller == nullptr || Callee == nullptr) { ++ this->setCodeRegion(autotuning::CodeRegion::getInvalidInstance()); ++ return; ++ } ++ ++ autotuning::SourceLocation SrcLoc; ++ if (this->CB->getDebugLoc()) { ++ unsigned int SourceLine = this->CB->getDebugLoc()->getLine(); ++ // Get modified source line number for current callsite if there is another ++ // call instruction (to same callee) which has same source line number ++ // happened due to inlining. ++ std::optional LineNum = autotuning::Engine.getCallSiteLoc(CB); ++ if (LineNum) ++ SourceLine = *LineNum; ++ SrcLoc = autotuning::SourceLocation{ ++ this->CB->getDebugLoc()->getFilename().str(), SourceLine, ++ this->CB->getDebugLoc()->getColumn()}; ++ } ++ ++ // We are using DebugLoc to distinguish between multiple calls to the same ++ // callee in a function. It may be possible that these multiple calls have ++ // same DebugLoc either 1) due to inlining of multiple calls (same callee) ++ // and callee having more calls, or 2) cloned calls added by previous ++ // optimizations. We are using 'callee name + it's parent (basic block) name' ++ // to solve these problems. Additionally we are using modified line number ++ // for the issue # 1; this will handle the cases where the multiple calls are ++ // in the same basic block. 
++ autotuning::CodeRegion CR = autotuning::CodeRegion( ++ Callee->getName().str() + "-" + this->CB->getParent()->getName().str(), ++ Caller->getName().data(), autotuning::CodeRegionType::CallSite, SrcLoc, ++ autotuning::DynamicOptions{{"ForceInline", {0, 1}}}); ++ ++ CR.setSize(Callee->getInstructionCount()); ++ CR.setHotness(Caller->ATEFunction.getHotness()); ++ this->setCodeRegion(CR); ++} ++#endif ++ + //===----------------------------------------------------------------------===// + // CallBase Implementation + //===----------------------------------------------------------------------===// +diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp +index 6ea108d831a1..1583e1c82b3e 100644 +--- a/llvm/lib/IR/StructuralHash.cpp ++++ b/llvm/lib/IR/StructuralHash.cpp +@@ -10,9 +10,23 @@ + #include "llvm/IR/Function.h" + #include "llvm/IR/GlobalVariable.h" + #include "llvm/IR/Module.h" ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/CodeGen/MachineBasicBlock.h" ++#include "llvm/IR/InstrTypes.h" ++#include "llvm/IR/Instructions.h" ++#include "llvm/Support/CommandLine.h" ++#endif + + using namespace llvm; + ++#if defined(ENABLE_AUTOTUNER) ++// AutoTuner Flag to use callsite Debug Location for hash cacluation. ++static cl::opt HashCallSite( ++ "hash-prior-to-callsite", cl::init(true), cl::Hidden, ++ cl::desc("Use function IR prior to a call site to compute the hashcode for" ++ " the call site")); ++#endif ++ + namespace { + + // Basic hashing mechanism to detect structural change to the IR, used to verify +@@ -21,16 +35,81 @@ namespace { + + class StructuralHashImpl { + hash_code Hash; ++#if defined(ENABLE_AUTOTUNER) ++ const uint64_t BLOCK_HEADER_HASH = 45798; ++#endif + + template void hash(const T &V) { Hash = hash_combine(Hash, V); } + + public: + StructuralHashImpl() : Hash(4) {} + ++#if defined(ENABLE_AUTOTUNER) ++ void update(const MachineBasicBlock &MBB) { ++ // Update the structural hash when we encounter a new basic block. 
++ // Prevents CodeRegions with different structures, but many empty ++ // BasicBlocks to have the same structural hash. ++ if (const BasicBlock *Block = MBB.getBasicBlock()) { ++ hash(BLOCK_HEADER_HASH); // Block header ++ for (auto &Inst : *Block) ++ hash(Inst.getOpcode()); ++ } ++ } ++ ++ void update(const std::vector BBs) { ++ // Update the structural hash when we encounter a new basic block. ++ // Prevents CodeRegions with different structures, but many empty ++ // BasicBlocks to have the same structural hash. ++ for (BasicBlock *BB : BBs) { ++ if (BB == nullptr) ++ continue; ++ ++ hash(BLOCK_HEADER_HASH); // Block header ++ for (auto &Inst : *BB) ++ hash(Inst.getOpcode()); ++ } ++ } ++ ++ void update(const llvm::CallBase &CB) { ++ StringRef Name = ""; ++ if (HashCallSite) { ++ update(*CB.getCaller(), std::addressof(CB)); ++ } else { ++ const Function &F = *CB.getCaller(); ++ Name = F.getName(); ++ std::string FileName = Name.str(); ++ for (uint64_t Idx = 0; Idx < Name.size(); Idx = Idx + sizeof(uint64_t)) { ++ uint64_t Value = 0; ++ FileName.copy((char *)&Value, sizeof(uint64_t), Idx); ++ hash(Value); ++ } ++ } ++ ++ update(*CB.getCalledFunction()); ++ } ++ ++ void update(const SwitchInst &SI) { ++ hash(SI.getNumCases()); ++ for (auto Case : SI.cases()) { ++ hash(BLOCK_HEADER_HASH); ++ const BasicBlock *BB = Case.getCaseSuccessor(); ++ for (auto &Inst : *BB) ++ hash(Inst.getOpcode()); ++ } ++ } ++ ++ void update(const Function &F, const CallBase *TargetCB = nullptr) { ++ if (F.isDeclaration()) ++ return; ++ ++ const Instruction *I = ++ TargetCB ? (dyn_cast(TargetCB)) : nullptr; ++#else + void update(const Function &F) { + // Declarations don't affect analyses. 
+ if (F.isDeclaration()) + return; ++#endif + + hash(12345); // Function header + +@@ -44,9 +123,18 @@ public: + VisitedBBs.insert(BBs[0]); + while (!BBs.empty()) { + const BasicBlock *BB = BBs.pop_back_val(); ++#if defined(ENABLE_AUTOTUNER) ++ hash(BLOCK_HEADER_HASH); // Block header ++ for (auto &Inst : *BB) { ++ hash(Inst.getOpcode()); ++ if (I && Inst.isIdenticalTo(I)) ++ return; ++ } ++#else + hash(45798); // Block header + for (auto &Inst : *BB) + hash(Inst.getOpcode()); ++#endif + + const Instruction *Term = BB->getTerminator(); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { +@@ -79,6 +167,32 @@ public: + + } // namespace + ++#if defined(ENABLE_AUTOTUNER) ++uint64_t llvm::StructuralHash(const MachineBasicBlock &MBB) { ++ StructuralHashImpl H; ++ H.update(MBB); ++ return H.getHash(); ++} ++ ++uint64_t llvm::StructuralHash(const std::vector BBs) { ++ StructuralHashImpl H; ++ H.update(BBs); ++ return H.getHash(); ++} ++ ++uint64_t llvm::StructuralHash(const CallBase &CB) { ++ StructuralHashImpl H; ++ H.update(CB); ++ return H.getHash(); ++} ++ ++uint64_t llvm::StructuralHash(const SwitchInst &SI) { ++ StructuralHashImpl H; ++ H.update(SI); ++ return H.getHash(); ++} ++#endif ++ + uint64_t llvm::StructuralHash(const Function &F) { + StructuralHashImpl H; + H.update(F); +diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp +index d0cbbcc0e310..a3ccbc6d258f 100644 +--- a/llvm/lib/Passes/PassBuilder.cpp ++++ b/llvm/lib/Passes/PassBuilder.cpp +@@ -262,6 +262,11 @@ + #include "llvm/Transforms/Vectorize/VectorCombine.h" + #include + ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Analysis/AutotuningDump.h" ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#endif ++ + using namespace llvm; + + static const Regex DefaultAliasRegex( +diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp +index 660cb2e974d7..8009e011833c 100644 +--- a/llvm/lib/Passes/PassBuilderPipelines.cpp 
++++ b/llvm/lib/Passes/PassBuilderPipelines.cpp +@@ -133,6 +133,11 @@ + #include "llvm/Transforms/Vectorize/SLPVectorizer.h" + #include "llvm/Transforms/Vectorize/VectorCombine.h" + ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#endif ++ + using namespace llvm; + + static cl::opt UseInlineAdvisor( +@@ -289,6 +294,10 @@ PipelineTuningOptions::PipelineTuningOptions() { + EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; + } + ++#if defined(ENABLE_AUTOTUNER) ++extern cl::opt AutoTuningCompileMode; ++#endif ++ + namespace llvm { + extern cl::opt MaxDevirtIterations; + extern cl::opt EnableKnowledgeRetention; +@@ -452,9 +461,17 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, + // attention to it. + if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || + PGOOpt->Action != PGOOptions::SampleUse) ++#if defined(ENABLE_AUTOTUNER) ++ { ++ if (AutoTuningCompileMode) ++ LPM2.addPass(AutoTuningCompileLoopPass(autotuning::CompileOptionUnroll)); ++#endif + LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), + /* OnlyWhenForced= */ !PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll)); ++#if defined(ENABLE_AUTOTUNER) ++ } ++#endif + + invokeLoopOptimizerEndEPCallbacks(LPM2, Level); + +@@ -631,9 +648,17 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, + // attention to it. 
+ if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || + PGOOpt->Action != PGOOptions::SampleUse) ++#if defined(ENABLE_AUTOTUNER) ++ { ++ if (AutoTuningCompileMode) ++ LPM2.addPass(AutoTuningCompileLoopPass(autotuning::CompileOptionUnroll)); ++#endif + LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), + /* OnlyWhenForced= */ !PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll)); ++#if defined(ENABLE_AUTOTUNER) ++ } ++#endif + + invokeLoopOptimizerEndEPCallbacks(LPM2, Level); + +@@ -1110,6 +1135,11 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, + if (EnableSyntheticCounts && !PGOOpt) + MPM.addPass(SyntheticCountsPropagation()); + ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode) ++ MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionInline)); ++#endif ++ + if (EnableModuleInliner) + MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); + else +@@ -1131,6 +1161,12 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, + /// TODO: Should LTO cause any differences to this set of passes? + void PassBuilder::addVectorPasses(OptimizationLevel Level, + FunctionPassManager &FPM, bool IsFullLTO) { ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode && !IsFullLTO) ++ FPM.addPass( ++ AutoTuningCompileFunctionPass(autotuning::CompileOptionVectorize)); ++#endif ++ + FPM.addPass(LoopVectorizePass( + LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); + +@@ -1444,6 +1480,10 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, + return buildO0DefaultPipeline(Level, LTOPreLink); + + ModulePassManager MPM; ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode) ++ MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionStart)); ++#endif + + // Convert @llvm.global.annotations to !annotation metadata. 
+ MPM.addPass(Annotation2MetadataPass()); +@@ -1475,6 +1515,12 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, + + if (LTOPreLink) + addRequiredLTOPreLinkPasses(MPM); ++ ++#if defined(ENABLE_AUTOTUNER) ++ if (AutoTuningCompileMode) ++ MPM.addPass(AutoTuningCompileModulePass(autotuning::CompileOptionEnd)); ++#endif ++ + return MPM; + } + +diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def +index e10dc995c493..45a539f14b93 100644 +--- a/llvm/lib/Passes/PassRegistry.def ++++ b/llvm/lib/Passes/PassRegistry.def +@@ -29,6 +29,10 @@ MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) + MODULE_ANALYSIS("inline-advisor", InlineAdvisorAnalysis()) + MODULE_ANALYSIS("ir-similarity", IRSimilarityAnalysis()) + ++#if defined(ENABLE_AUTOTUNER) ++MODULE_ANALYSIS("autotuning-dump", AutotuningDumpAnalysis()) ++#endif ++ + #ifndef MODULE_ALIAS_ANALYSIS + #define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ + MODULE_ANALYSIS(NAME, CREATE_PASS) +@@ -127,6 +131,9 @@ MODULE_PASS("sanmd-module", SanitizerBinaryMetadataPass()) + MODULE_PASS("memprof-module", ModuleMemProfilerPass()) + MODULE_PASS("poison-checking", PoisonCheckingPass()) + MODULE_PASS("pseudo-probe-update", PseudoProbeUpdatePass()) ++#if defined(ENABLE_AUTOTUNER) ++MODULE_PASS("autotuning-compile-module", AutoTuningCompileModulePass()) ++#endif + #undef MODULE_PASS + + #ifndef MODULE_PASS_WITH_PARAMS +@@ -430,6 +437,9 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) + FUNCTION_PASS("tsan", ThreadSanitizerPass()) + FUNCTION_PASS("memprof", MemProfilerPass()) + FUNCTION_PASS("declare-to-assign", llvm::AssignmentTrackingPass()) ++#if defined(ENABLE_AUTOTUNER) ++FUNCTION_PASS("autotuning-compile-function", AutoTuningCompileFunctionPass()) ++#endif + #undef FUNCTION_PASS + + #ifndef FUNCTION_PASS_WITH_PARAMS +@@ -614,6 +624,9 @@ LOOP_PASS("guard-widening", GuardWideningPass()) + LOOP_PASS("loop-bound-split", LoopBoundSplitPass()) + 
LOOP_PASS("loop-reroll", LoopRerollPass()) + LOOP_PASS("loop-versioning-licm", LoopVersioningLICMPass()) ++#if defined(ENABLE_AUTOTUNER) ++LOOP_PASS("autotuning-compile-loop", AutoTuningCompileLoopPass()) ++#endif + #undef LOOP_PASS + + #ifndef LOOP_PASS_WITH_PARAMS +diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp +index 7eef511928ec..8653027ceed2 100644 +--- a/llvm/lib/Passes/StandardInstrumentations.cpp ++++ b/llvm/lib/Passes/StandardInstrumentations.cpp +@@ -41,6 +41,10 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#endif + + using namespace llvm; + +@@ -107,6 +111,10 @@ static cl::opt PrintOnCrash( + cl::desc("Print the last form of the IR before crash (use -print-on-crash-path to dump to a file)"), + cl::Hidden); + ++#if defined(ENABLE_AUTOTUNER) ++extern cl::opt AutoTuningCompileMode; ++#endif ++ + static cl::opt OptBisectPrintIRPath( + "opt-bisect-print-ir-path", + cl::desc("Print IR to path when opt-bisect-limit is reached"), cl::Hidden); +@@ -874,6 +882,21 @@ bool OptPassGateInstrumentation::shouldRun(StringRef PassName, Any IR) { + + void OptPassGateInstrumentation::registerCallbacks( + PassInstrumentationCallbacks &PIC) { ++#if defined(ENABLE_AUTOTUNER) ++ // Using AutoTuner OptBisect to change the behavior of compilation pipeline. ++ // Flag 'opt-bisect-limit' will be preferred if both 'opt-bisect-limit' and ++ // incremental compilation flags are used. 
++ if (autotuning::Engine.isParseInput() && AutoTuningCompileMode) { ++ if (!getAutoTuningOptPassGate().isEnabled()) ++ return; ++ ++ PIC.registerShouldRunOptionalPassCallback([](StringRef PassID, Any IR) { ++ return isIgnored(PassID) || ++ getAutoTuningOptPassGate().checkPass(PassID, getIRName(IR)); ++ }); ++ return; ++ } ++#endif + OptPassGate &PassGate = Context.getOptPassGate(); + if (!PassGate.isEnabled()) + return; +diff --git a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp +index b2627196bce6..b1dfa9d0f2cf 100644 +--- a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp ++++ b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp +@@ -277,6 +277,14 @@ void BitstreamRemarkSerializerHelper::emitRemarkBlock(const Remark &Remark, + R.push_back(StrTab.add(Remark.RemarkName).first); + R.push_back(StrTab.add(Remark.PassName).first); + R.push_back(StrTab.add(Remark.FunctionName).first); ++#if defined(ENABLE_AUTOTUNER) ++ if (Remark.CodeRegionType) ++ R.push_back(StrTab.add(*Remark.CodeRegionType).first); ++ if (std::optional hash = Remark.CodeRegionHash) ++ R.push_back(*hash); ++ if (std::optional Invocation = Remark.Invocation) ++ R.push_back(*Invocation); ++#endif + Bitstream.EmitRecordWithAbbrev(RecordRemarkHeaderAbbrevID, R); + + if (const std::optional &Loc = Remark.Loc) { +diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp +index 9f4676ce37ab..d1faf4f1553a 100644 +--- a/llvm/lib/Remarks/RemarkStreamer.cpp ++++ b/llvm/lib/Remarks/RemarkStreamer.cpp +@@ -14,6 +14,10 @@ + #include "llvm/Support/CommandLine.h" + #include + ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/IR/DebugInfoMetadata.h" ++#endif ++ + using namespace llvm; + using namespace llvm::remarks; + +diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp +index f5123b0f64ce..baa393c6a619 100644 +--- a/llvm/lib/Remarks/YAMLRemarkParser.cpp ++++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp +@@ 
-17,10 +17,23 @@ + #include "llvm/Support/Endian.h" + #include "llvm/Support/Path.h" + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Support/CommandLine.h" ++#endif + + using namespace llvm; + using namespace llvm::remarks; + ++#if defined(ENABLE_AUTOTUNER) ++// Creating code regions without meta data (e.g. debug Location, Function Name, ++// etc.). ++// This flag is added here instead of 'lib/AutoTuner/AutoTuning.cpp' to avoid ++// making LLVMRemarks dependent on LLVMCore. ++cl::opt OmitAutotuningMetadata( ++ "auto-tuning-omit-metadata", cl::Hidden, cl::init(false), ++ cl::desc("Include only code region hashes and types in opportunity files")); ++#endif ++ + char YAMLParseError::ID = 0; + + static void handleDiagnostic(const SMDiagnostic &Diag, void *Ctx) { +@@ -235,6 +248,23 @@ YAMLRemarkParser::parseRemark(yaml::Document &RemarkEntry) { + TheRemark.FunctionName = *MaybeStr; + else + return MaybeStr.takeError(); ++#if defined(ENABLE_AUTOTUNER) ++ } else if (KeyName == "CodeRegionType") { ++ if (Expected MaybeStr = parseStr(RemarkField)) ++ TheRemark.CodeRegionType = *MaybeStr; ++ else ++ return MaybeStr.takeError(); ++ } else if (KeyName == "CodeRegionHash") { ++ if (Expected MaybeULL = parseUnsignedLL(RemarkField)) ++ TheRemark.CodeRegionHash = *MaybeULL; ++ else ++ return MaybeULL.takeError(); ++ } else if (KeyName == "Invocation") { ++ if (Expected MaybeULL = parseUnsignedLL(RemarkField)) ++ TheRemark.Invocation = *MaybeULL; ++ else ++ return MaybeULL.takeError(); ++#endif + } else if (KeyName == "Hotness") { + if (Expected MaybeU = parseUnsigned(RemarkField)) + TheRemark.Hotness = *MaybeU; +@@ -261,11 +291,35 @@ YAMLRemarkParser::parseRemark(yaml::Document &RemarkEntry) { + } + } + ++#if defined(ENABLE_AUTOTUNER) ++ // Check if any of the mandatory fields are missing. ++ if (TheRemark.RemarkType == Type::AutoTuning) { ++ // We expect type, and pass to be present at least. 
++ if (!TheRemark.CodeRegionType || TheRemark.PassName.empty()) ++ return error("CodeRegionHash, CodeRegionType, or Pass missing.", ++ *RemarkEntry.getRoot()); ++ ++ // Sanity check for the correct command line option. ++ if (!OmitAutotuningMetadata && TheRemark.RemarkName.empty()) ++ return error("Remark Name expected; enable -autotuning-omit-metadata.", ++ *RemarkEntry.getRoot()); ++ ++ if (!OmitAutotuningMetadata && TheRemark.FunctionName.empty()) ++ return error( ++ "Remark Function Name expected; enable -autotuning-omit-metadata.", ++ *RemarkEntry.getRoot()); ++ } else if (TheRemark.RemarkType == Type::Unknown || ++ TheRemark.PassName.empty() || TheRemark.RemarkName.empty() || ++ TheRemark.FunctionName.empty()) ++ return error("Type, Pass, Name or Function missing.", ++ *RemarkEntry.getRoot()); ++#else + // Check if any of the mandatory fields are missing. + if (TheRemark.RemarkType == Type::Unknown || TheRemark.PassName.empty() || + TheRemark.RemarkName.empty() || TheRemark.FunctionName.empty()) + return error("Type, Pass, Name or Function missing.", + *RemarkEntry.getRoot()); ++#endif + + return std::move(Result); + } +@@ -277,6 +331,9 @@ Expected YAMLRemarkParser::parseType(yaml::MappingNode &Node) { + .Case("!Analysis", remarks::Type::Analysis) + .Case("!AnalysisFPCommute", remarks::Type::AnalysisFPCommute) + .Case("!AnalysisAliasing", remarks::Type::AnalysisAliasing) ++#if defined(ENABLE_AUTOTUNER) ++ .Case("!AutoTuning", remarks::Type::AutoTuning) ++#endif + .Case("!Failure", remarks::Type::Failure) + .Default(remarks::Type::Unknown); + if (Type == remarks::Type::Unknown) +@@ -313,6 +370,31 @@ Expected YAMLRemarkParser::parseStr(yaml::KeyValueNode &Node) { + return Result; + } + ++#if defined(ENABLE_AUTOTUNER) ++Expected> ++YAMLRemarkParser::parseStrVector(yaml::KeyValueNode &Node) { ++ std::vector Result; ++ auto *SequenceNode = dyn_cast(Node.getValue()); ++ if (!SequenceNode) ++ return error("expected a value of sequence type.", Node); ++ ++ for 
(yaml::Node &Element : *SequenceNode) { ++ auto *ScalarNode = dyn_cast(&Element); ++ if (!ScalarNode) ++ return error("expected a value of scalar type.", Element); ++ else { ++ StringRef Str = ScalarNode->getRawValue(); ++ if (Str.front() == '\'') ++ Str = Str.drop_front(); ++ if (Str.back() == '\'') ++ Str = Str.drop_back(); ++ Result.push_back(Str); ++ } ++ } ++ return Result; ++} ++#endif ++ + Expected YAMLRemarkParser::parseUnsigned(yaml::KeyValueNode &Node) { + SmallVector Tmp; + auto *Value = dyn_cast(Node.getValue()); +@@ -324,6 +406,19 @@ Expected YAMLRemarkParser::parseUnsigned(yaml::KeyValueNode &Node) { + return UnsignedValue; + } + ++#if defined(ENABLE_AUTOTUNER) ++Expected YAMLRemarkParser::parseUnsignedLL(yaml::KeyValueNode &Node) { ++ SmallVector Tmp; ++ if (auto *Value = dyn_cast(Node.getValue())) { ++ uint64_t UnsignedValue = 0; ++ if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue)) ++ return error("expected a value of integer type.", *Value); ++ return UnsignedValue; ++ } ++ return error("expected a value of scalar type.", Node); ++} ++#endif ++ + Expected + YAMLRemarkParser::parseDebugLoc(yaml::KeyValueNode &Node) { + auto *DebugLoc = dyn_cast(Node.getValue()); +@@ -374,6 +469,9 @@ Expected YAMLRemarkParser::parseArg(yaml::Node &Node) { + + std::optional KeyStr; + std::optional ValueStr; ++#if defined(ENABLE_AUTOTUNER) ++ std::optional> ValueStrVector; ++#endif + std::optional Loc; + + for (yaml::KeyValueNode &ArgEntry : *ArgMap) { +@@ -400,11 +498,27 @@ Expected YAMLRemarkParser::parseArg(yaml::Node &Node) { + if (ValueStr) + return error("only one string entry is allowed per argument.", ArgEntry); + ++#if defined(ENABLE_AUTOTUNER) ++ // Try to parse the value to a string vector. ++ if (Expected> MaybeStrVector = ++ parseStrVector(ArgEntry)) { ++ ValueStrVector = *MaybeStrVector; ++ ValueStr = ""; ++ } else { ++ consumeError(MaybeStrVector.takeError()); ++ // Try to parse the value. 
++ if (Expected MaybeStr = parseStr(ArgEntry)) ++ ValueStr = *MaybeStr; ++ else ++ return MaybeStr.takeError(); ++ } ++#else + // Try to parse the value. + if (Expected MaybeStr = parseStr(ArgEntry)) + ValueStr = *MaybeStr; + else + return MaybeStr.takeError(); ++#endif + + // Keep the key from the string. + KeyStr = KeyName; +@@ -412,10 +526,18 @@ Expected YAMLRemarkParser::parseArg(yaml::Node &Node) { + + if (!KeyStr) + return error("argument key is missing.", *ArgMap); ++#if defined(ENABLE_AUTOTUNER) ++ if (!ValueStr && !ValueStrVector) ++#else + if (!ValueStr) ++#endif + return error("argument value is missing.", *ArgMap); + ++#if defined(ENABLE_AUTOTUNER) ++ return Argument{*KeyStr, *ValueStr, ValueStrVector, Loc}; ++#else + return Argument{*KeyStr, *ValueStr, Loc}; ++#endif + } + + Expected> YAMLRemarkParser::next() { +diff --git a/llvm/lib/Remarks/YAMLRemarkParser.h b/llvm/lib/Remarks/YAMLRemarkParser.h +index 8ef72e16be74..141f10dd3900 100644 +--- a/llvm/lib/Remarks/YAMLRemarkParser.h ++++ b/llvm/lib/Remarks/YAMLRemarkParser.h +@@ -91,6 +91,12 @@ protected: + Expected parseDebugLoc(yaml::KeyValueNode &Node); + /// Parse an argument. + Expected parseArg(yaml::Node &Node); ++#if defined(ENABLE_AUTOTUNER) ++ /// parse a vector of strings. ++ Expected> parseStrVector(yaml::KeyValueNode &Node); ++ /// Parse one value to an unsigned long long. ++ Expected parseUnsignedLL(yaml::KeyValueNode &Node); ++#endif + }; + + /// YAML with a string table to Remark parser. 
+diff --git a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp +index 68285c3dde1b..1bc0f23f9221 100644 +--- a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp ++++ b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp +@@ -15,10 +15,45 @@ + #include "llvm/Remarks/Remark.h" + #include "llvm/Support/FileSystem.h" + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/Support/CommandLine.h" ++#endif + + using namespace llvm; + using namespace llvm::remarks; + ++#if defined(ENABLE_AUTOTUNER) ++extern cl::opt OmitAutotuningMetadata; ++ ++// Use the same keys whether we use a string table or not (respectively, T is an ++// unsigned or a StringRef). ++template ++static void mapRemarkHeader( ++ yaml::IO &io, T PassName, T RemarkName, std::optional RL, ++ T FunctionName, std::optional CodeRegionType, ++ std::optional CodeRegionHash, ++ std::optional Invocation, ++ std::optional> BaselineConfig, ++ std::optional>> ++ AutoTunerOptions, ++ std::optional Hotness, ArrayRef Args) { ++ io.mapRequired("Pass", PassName); ++ if (!OmitAutotuningMetadata) { ++ io.mapRequired("Name", RemarkName); ++ io.mapOptional("DebugLoc", RL); ++ io.mapRequired("Function", FunctionName); ++ } ++ io.mapOptional("CodeRegionType", CodeRegionType); ++ io.mapOptional("CodeRegionHash", CodeRegionHash); ++ io.mapOptional("DynamicConfigs", AutoTunerOptions); ++ io.mapOptional("BaselineConfig", BaselineConfig); ++ io.mapOptional("Invocation", Invocation); ++ if (!OmitAutotuningMetadata) { ++ io.mapOptional("Hotness", Hotness); ++ io.mapOptional("Args", Args); ++ } ++} ++#else + // Use the same keys whether we use a string table or not (respectively, T is an + // unsigned or a StringRef). 
+ template +@@ -33,6 +68,7 @@ static void mapRemarkHeader(yaml::IO &io, T PassName, T RemarkName, + io.mapOptional("Hotness", Hotness); + io.mapOptional("Args", Args); + } ++#endif + + namespace llvm { + namespace yaml { +@@ -53,6 +89,10 @@ template <> struct MappingTraits { + else if (io.mapTag("!AnalysisAliasing", + (Remark->RemarkType == Type::AnalysisAliasing))) + ; ++#if defined(ENABLE_AUTOTUNER) ++ else if (io.mapTag("!AutoTuning", (Remark->RemarkType == Type::AutoTuning))) ++ ; ++#endif + else if (io.mapTag("!Failure", (Remark->RemarkType == Type::Failure))) + ; + else +@@ -66,14 +106,58 @@ template <> struct MappingTraits { + unsigned NameID = StrTab.add(Remark->RemarkName).first; + unsigned FunctionID = StrTab.add(Remark->FunctionName).first; + mapRemarkHeader(io, PassID, NameID, Remark->Loc, FunctionID, ++#if defined(ENABLE_AUTOTUNER) ++ Remark->CodeRegionType, Remark->CodeRegionHash, ++ Remark->Invocation, Remark->BaselineConfig, ++ Remark->AutoTunerOptions, Remark->Hotness, Remark->Args); ++ ++#else + Remark->Hotness, Remark->Args); ++#endif + } else { + mapRemarkHeader(io, Remark->PassName, Remark->RemarkName, Remark->Loc, ++#if defined(ENABLE_AUTOTUNER) ++ Remark->FunctionName, Remark->CodeRegionType, ++ Remark->CodeRegionHash, Remark->Invocation, ++ Remark->BaselineConfig, Remark->AutoTunerOptions, ++ Remark->Hotness, Remark->Args); ++#else + Remark->FunctionName, Remark->Hotness, Remark->Args); ++#endif + } + } + }; + ++#if defined(ENABLE_AUTOTUNER) ++// YAML I/O to support dumping 'Values: { key: [...], ... }' in opportunity ++// files. 
++template <> ++struct MappingTraits>> { ++ static void mapping(IO &io, ++ std::map> &OM) { ++ assert(io.outputting() && "input not yet implemented"); ++ ++ // Print as an abbreviated dictionary ++ llvm::yaml::StdMapStringCustomMappingTraitsImpl< ++ std::vector>::output(io, OM); ++ } ++ // This sets the beginFlowMapping and endFlowMapping ++ static const bool flow = true; ++}; ++ ++template <> struct MappingTraits> { ++ static void mapping(IO &io, std::map &OM) { ++ assert(io.outputting() && "input not yet implemented"); ++ ++ // Print as an abbreviated dictionary ++ llvm::yaml::StdMapStringCustomMappingTraitsImpl::output(io, ++ OM); ++ } ++ // This sets the beginFlowMapping and endFlowMapping ++ static const bool flow = true; ++}; ++#endif ++ + template <> struct MappingTraits { + static void mapping(IO &io, RemarkLocation &RL) { + assert(io.outputting() && "input not yet implemented"); +diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp +index d3efb8b67be5..b66415c0e9a9 100644 +--- a/llvm/lib/Support/CommandLine.cpp ++++ b/llvm/lib/Support/CommandLine.cpp +@@ -127,6 +127,9 @@ static inline bool isPrefixedOrGrouping(const Option *O) { + O->getFormattingFlag() == cl::AlwaysPrefix; + } + ++#if defined(ENABLE_AUTOTUNER) ++#include ++#endif + + namespace { + +@@ -1470,6 +1473,44 @@ bool cl::ParseCommandLineOptions(int argc, const char *const *argv, + Errs, LongOptionsUseDoubleDash); + } + ++#if defined(ENABLE_AUTOTUNER) ++bool cl::ParseAutoTunerOptions( ++ std::unordered_map LLVMParams, ++ std::unordered_map ProgramParams, ++ StringRef Overview, raw_ostream *Errs, const char *EnvVar, ++ bool LongOptionsUseDoubleDash) { ++ SmallVector NewArgv; ++ BumpPtrAllocator A; ++ StringSaver Saver(A); ++ // GlobalParser requires arguments similar to C style command line options ++ // (int argc, char * argv[]) where argv[0] refers to the program name. ++ // We are using a fake program name here which is consistent with LLVM. 
++ NewArgv.push_back("AutoTuner (LLVM option parsing)"); ++ ++ for (const auto &I : LLVMParams) { ++ std::string NewOption = I.first + "=" + I.second; ++ NewArgv.push_back(Saver.save(NewOption).data()); ++ } ++ ++ for (const auto &I : ProgramParams) { ++ std::string NewOption = I.first + "=" + I.second; ++ NewArgv.push_back(Saver.save(NewOption).data()); ++ } ++ ++ // Parse options from environment variable. ++ if (EnvVar) { ++ if (std::optional EnvValue = ++ sys::Process::GetEnv(StringRef(EnvVar))) ++ TokenizeGNUCommandLine(*EnvValue, Saver, NewArgv); ++ } ++ ++ int NewArgc = static_cast(NewArgv.size()); ++ // Parse all options. ++ return GlobalParser->ParseCommandLineOptions(NewArgc, &NewArgv[0], Overview, ++ Errs, LongOptionsUseDoubleDash); ++} ++#endif ++ + /// Reset all options at least once, so that we can parse different options. + void CommandLineParser::ResetAllOptionOccurrences() { + // Reset all option values to look like they have never been seen before. +diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt +index 034f1587ae8d..3507d357a4c6 100644 +--- a/llvm/lib/Transforms/IPO/CMakeLists.txt ++++ b/llvm/lib/Transforms/IPO/CMakeLists.txt +@@ -57,6 +57,7 @@ add_llvm_component_library(LLVMipo + LINK_COMPONENTS + AggressiveInstCombine + Analysis ++ AutoTuner + BitReader + BitWriter + Core +diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp +index 3e00aebce372..802667819c44 100644 +--- a/llvm/lib/Transforms/IPO/Inliner.cpp ++++ b/llvm/lib/Transforms/IPO/Inliner.cpp +@@ -64,6 +64,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + using namespace llvm; + +@@ -298,6 +301,27 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, + // be deleted as a batch after inlining. 
+ SmallVector DeadFunctionsInComdats; + ++#if defined(ENABLE_AUTOTUNER) ++ bool IsAutoTunerEnabled = ++ autotuning::Engine.isEnabled() && ++ autotuning::Engine.isTuningAllowedForType(autotuning::CallSite); ++ if (IsAutoTunerEnabled) { ++ SmallVector, 16> CallsCopy = Calls; ++ for (int I = 0; I < (int)CallsCopy.size(); ++I) { ++ CallBase &CB = *CallsCopy[I].first; ++ DebugLoc DLoc = CB.getDebugLoc(); ++ if (!CB.getCaller() || !CB.getCalledFunction() || !DLoc) ++ continue; ++ autotuning::CallSiteLocation Loc = autotuning::CallSiteLocation{ ++ &CB, CB.getCaller(), CB.getCalledFunction(), ++ autotuning::SourceLocation{DLoc->getFilename().str(), DLoc->getLine(), ++ DLoc->getColumn()}}; ++ autotuning::Engine.insertCallSiteLoc(Loc); ++ } ++ autotuning::Engine.cleanCallSiteLoc(); ++ } ++#endif ++ + // Loop forward over all of the calls. Note that we cannot cache the size as + // inlining can introduce new calls that need to be processed. + for (int I = 0; I < (int)Calls.size(); ++I) { +@@ -412,6 +436,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, + if (NewCallee) { + if (!NewCallee->isDeclaration()) { + Calls.push_back({ICB, NewHistoryID}); ++#if defined(ENABLE_AUTOTUNER) ++ if (IsAutoTunerEnabled) ++ if (ICB->getDebugLoc()) ++ autotuning::Engine.updateCallSiteLocs( ++ CB, ICB, ICB->getCalledFunction(), ++ ICB->getDebugLoc()->getLine()); ++#endif + // Continually inlining through an SCC can result in huge compile + // times and bloated code since we arbitrarily stop at some point + // when the inliner decides it's not profitable to inline anymore. +@@ -527,6 +558,11 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, + FAM.invalidate(F, PreservedAnalyses::none()); + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (IsAutoTunerEnabled) ++ autotuning::Engine.clearCallSiteLocs(); ++#endif ++ + // We must ensure that we only delete functions with comdats if every function + // in the comdat is going to be deleted. 
+ if (!DeadFunctionsInComdats.empty()) { +diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp +index a53baecd4776..9590cf625c64 100644 +--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp ++++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp +@@ -1212,6 +1212,20 @@ bool SampleProfileLoader::inlineHotFunctions( + } + } + } ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) { ++ // If a callsite is hot/cold, mark its corresponding callee as ++ // hot/cold respectively so that auto-tuning engine will be able to ++ // selectively dump code regions as tuning opportunities. ++ if (const CallInst *CI = dyn_cast(&I)) ++ if (Function *Callee = CI->getCalledFunction()) { ++ if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) ++ Callee->getATEFunction().setHot(); ++ else ++ Callee->getATEFunction().setCold(); ++ } ++ } ++#endif + } + if (Hot || ExternalInlineAdvisor) { + CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end()); +diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +index 424f1d433606..955353944b14 100644 +--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt ++++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +@@ -30,6 +30,7 @@ add_llvm_component_library(LLVMInstrumentation + + LINK_COMPONENTS + Analysis ++ AutoTuner + Core + Demangle + MC +diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +index 3c8f25d73c62..b9459b59e704 100644 +--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp ++++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +@@ -2132,6 +2132,10 @@ static bool annotateAllFunctions( + F->addFnAttr(Attribute::InlineHint); + LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName() + << "\n"); ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) ++ 
F->getATEFunction().setHot(); ++#endif + } + for (auto &F : ColdFunctions) { + // Only set when there is no Attribute::Hot set by the user. For Hot +@@ -2148,6 +2152,10 @@ static bool annotateAllFunctions( + F->addFnAttr(Attribute::Cold); + LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName() + << "\n"); ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) ++ F->getATEFunction().setCold(); ++#endif + } + return true; + } +diff --git a/llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp b/llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp +new file mode 100644 +index 000000000000..c33cb7cfc256 +--- /dev/null ++++ b/llvm/lib/Transforms/Scalar/AutoTuningCompile.cpp +@@ -0,0 +1,334 @@ ++#if defined(ENABLE_AUTOTUNER) ++//===--------------- AutoTuningCompile.cpp - Auto-Tuning ------------------===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++// Copyright (C) 2017-2022, Huawei Technologies Co., Ltd. All rights reserved. ++// ++//===----------------------------------------------------------------------===// ++// ++/// \file ++/// This pass implements incremental compilation for AutoTuner to reduce the ++/// compilation time for tuning process. ++/// This pass performs 2 operations. ++/// 1. Writing module level IR files which can be used in subsequent ++/// compilations for AutoTuner flow. So clang frontend don't have to process ++/// the source code from scratch. ++/// 2. Add/Remove attributes for modules and functions to enable/disable ++/// execution of optimization pass(es). It further reduces the compilation ++/// time by skipping optimization pass(es) (If feasible). 
++// ++//===----------------------------------------------------------------------===// ++ ++#include "llvm/Transforms/Scalar/AutoTuningCompile.h" ++#include "llvm/Analysis/AutotuningDump.h" ++#include "llvm/AutoTuner/AutoTuning.h" ++#include "llvm/InitializePasses.h" ++#include "llvm/Support/CommandLine.h" ++#include "llvm/Transforms/Scalar.h" ++#include ++ ++// Enable debug messages for AutoTuning Compilation. ++#define DEBUG_TYPE "autotuning-compile" ++ ++using namespace llvm; ++ ++extern cl::opt AutoTuningCompileMode; ++ ++AutoTuningOptPassGate SkipPasses = AutoTuningOptPassGate(true); ++AutoTuningOptPassGate RunPasses = AutoTuningOptPassGate(false); ++bool AutoTuningCompileModule::SkipCompilation = false; ++ ++static void writeFiles(Module &M, std::string Pass) { ++ if (autotuning::Engine.isGenerateOutput()) { ++ switch (AutoTuningCompileMode) { ++ case Basic: ++ case CoarseGrain: ++ if (Pass == autotuning::CompileOptionStart) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: IR files writing before Pass: " ++ << Pass << ".\n"); ++ auto ATD = new AutotuningDumpLegacy(/* Incremental Compilation */ true); ++ ATD->runOnModule(M); ++ } ++ break; ++ case FineGrain: ++ if (autotuning::Engine.hasOpportunities()) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: IR files writing before Pass: " ++ << Pass << ".\n"); ++ auto ATD = new AutotuningDumpLegacy(/* Incremental Compilation */ true); ++ ATD->runOnModule(M); ++ } ++ break; ++ default: ++ llvm_unreachable("AutoTuningCompile: Unknown AutoTuner Incremental " ++ "Compilation mode.\n"); ++ } ++ } ++} ++ ++bool AutoTuningOptPassGate::shouldRunPass(const StringRef PassName, ++ StringRef IRDescription) { ++ LLVM_DEBUG(dbgs() << "Skip pass '" << PassName ++ << "': " << (Skip ? 
"True" : "False") << '\n'); ++ return !Skip; ++} ++ ++bool AutoTuningOptPassGate::checkPass(const StringRef PassName, ++ const StringRef TargetDesc) { ++ if (PassName.startswith("AutoTuningCompile")) { ++ LLVM_DEBUG(dbgs() << "Running '" << PassName << "'pass.\n"); ++ return true; ++ } ++ ++ LLVM_DEBUG(dbgs() << "Skip pass '" << PassName ++ << "': " << (Skip ? "True" : "False") << '\n'); ++ return !Skip; ++} ++ ++AutoTuningCompileModule::AutoTuningCompileModule(std::string Pass) { ++ this->Pass = Pass; ++} ++ ++void AutoTuningCompileModule::writeIRFiles(Module &M) const { ++ writeFiles(M, Pass); ++} ++ ++bool AutoTuningCompileModule::modifyCompilationPipeline(Module &M) const { ++ bool Changed = false; ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Deciding to enable/disable " ++ "optimization of module/functions. Pass: " ++ << Pass << '\n'); ++ ++ StringRef Filename = M.getName(); ++ size_t Pos = Filename.rfind(".ll"); ++ if (Pos == StringRef::npos) { ++ errs() << "AutoTuningCompile: Source file is not IR (.ll) file. 
" ++ "Disabling incremental compilation.\n"; ++ AutoTuningCompileMode = Inactive; ++ return Changed; ++ } ++ Filename = Filename.substr(0, Pos); ++ ++ switch (AutoTuningCompileMode) { ++ case Basic: ++ case CoarseGrain: ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: No change in opt pipeline for " ++ "Basic/CoarseGrain incremental compilation mode.\n"); ++ break; ++ case FineGrain: { ++ if (Pass == autotuning::CompileOptionStart) { ++ M.getContext().setOptPassGate(SkipPasses); ++ getAutoTuningOptPassGate().setSkip(true); ++ setSkipCompilation(true); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses enabled.\n"); ++ } else if (getSkipCompilation() && ++ (autotuning::Engine.shouldRunOptPass(Filename.str(), Pass) || ++ Pass == "end")) { ++ M.getContext().setOptPassGate(RunPasses); ++ getAutoTuningOptPassGate().setSkip(false); ++ setSkipCompilation(false); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses disabled.\n"); ++ } else ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Old decision (SkipPasses = " ++ << (getSkipCompilation() ? 
"True" : "False") ++ << " ) continued.\n"); ++ ++ Changed = true; ++ break; ++ } ++ default: ++ llvm_unreachable( ++ "AutoTuningCompile: Unknown AutoTuner Incremental Compilation mode.\n"); ++ } ++ ++ return Changed; ++} ++ ++bool AutoTuningCompileModule::run(Module &M) { ++ bool Changed = false; ++ if (AutoTuningCompileMode == Inactive) ++ return Changed; ++ ++ if (!autotuning::Engine.isEnabled()) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: AutoTuner is not enabled.\n"); ++ return Changed; ++ } ++ ++ writeIRFiles(M); ++ ++ if (autotuning::Engine.isParseInput()) ++ Changed |= modifyCompilationPipeline(M); ++ ++ return Changed; ++} ++ ++AutoTuningCompileModuleLegacy::AutoTuningCompileModuleLegacy(std::string Pass) ++ : ModulePass(AutoTuningCompileModuleLegacy::ID) { ++ this->Pass = Pass; ++} ++ ++bool AutoTuningCompileModuleLegacy::runOnModule(Module &M) { ++ AutoTuningCompileModule Impl(Pass); ++ return Impl.run(M); ++} ++ ++char AutoTuningCompileModuleLegacy::ID = 0; ++ ++StringRef AutoTuningCompileModuleLegacy::getPassName() const { ++ return "AutoTuner Incremental Compilation"; ++} ++ ++INITIALIZE_PASS(AutoTuningCompileModuleLegacy, "autotuning-compile-module", ++ "AutoTuner Incremental Compilation", false, false) ++ ++// Public interface to the AutoTuningCompile pass ++ModulePass *llvm::createAutoTuningCompileModuleLegacyPass(std::string Pass) { ++ return new AutoTuningCompileModuleLegacy(Pass); ++} ++ ++PreservedAnalyses AutoTuningCompileModulePass::run(Module &M, ++ ModuleAnalysisManager &) { ++ AutoTuningCompileModule Impl(Pass); ++ Impl.run(M); ++ return PreservedAnalyses::all(); ++} ++ ++AutoTuningCompileFunction::AutoTuningCompileFunction(std::string Pass) { ++ this->Pass = Pass; ++} ++ ++void AutoTuningCompileFunction::writeIRFiles(Module &M) { ++ if (IsModuleWritten) ++ return; ++ IsModuleWritten = true; ++ writeFiles(M, Pass); ++} ++ ++bool AutoTuningCompileFunction::modifyCompilationPipeline(Function &F) { ++ bool Changed = false; ++ 
LLVM_DEBUG(dbgs() << "AutoTuningCompile: Deciding to enable/disable " ++ "optimization of module/functions. Pass: " ++ << Pass << '\n'); ++ Module *M = F.getParent(); ++ StringRef Filename = M->getName(); ++ size_t Pos = Filename.rfind(".ll"); ++ if (Pos == StringRef::npos) { ++ errs() << "AutoTuningCompile: Source file is not IR (.ll) file. " ++ "Disabling incremental compilation.\n"; ++ AutoTuningCompileMode = Inactive; ++ return Changed; ++ } ++ Filename = Filename.substr(0, Pos); ++ ++ switch (AutoTuningCompileMode) { ++ case Basic: ++ case CoarseGrain: ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: No change in opt pipeline for " ++ "Basic/CoarseGrain incremental compilation mode.\n"); ++ break; ++ case FineGrain: { ++ if (!AutoTuningCompileModule::getSkipCompilation() && ++ Pass == autotuning::CompileOptionStart) { ++ if (!SkipDecision) { ++ M->getContext().setOptPassGate(SkipPasses); ++ getAutoTuningOptPassGate().setSkip(true); ++ SkipDecision = true; ++ } ++ AutoTuningCompileModule::setSkipCompilation(true); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses enabled.\n"); ++ } else if (AutoTuningCompileModule::getSkipCompilation() && ++ Pass != autotuning::CompileOptionStart && ++ (autotuning::Engine.shouldRunOptPass(Filename.str(), Pass) || ++ Pass == autotuning::CompileOptionEnd)) { ++ M->getContext().setOptPassGate(RunPasses); ++ getAutoTuningOptPassGate().setSkip(false); ++ SkipDecision = false; ++ AutoTuningCompileModule::setSkipCompilation(false); ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: SkipPasses disabled.\n"); ++ } else ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: Old decision (SkipPasses = " ++ << (AutoTuningCompileModule::getSkipCompilation() ++ ? 
"True" ++ : "False") ++ << " ) continued.\n"); ++ ++ Changed = true; ++ break; ++ } ++ default: ++ llvm_unreachable( ++ "AutoTuningCompile: Unknown AutoTuner Incremental Compilation mode.\n"); ++ } ++ ++ return Changed; ++} ++ ++bool AutoTuningCompileFunction::run(Function &F) { ++ bool Changed = false; ++ if (AutoTuningCompileMode == Inactive) ++ return Changed; ++ ++ if (!autotuning::Engine.isEnabled()) { ++ LLVM_DEBUG(dbgs() << "AutoTuningCompile: AutoTuner is not enabled.\n"); ++ return Changed; ++ } ++ ++ writeIRFiles(*F.getParent()); ++ ++ if (autotuning::Engine.isParseInput()) ++ Changed |= modifyCompilationPipeline(F); ++ ++ return Changed; ++} ++ ++AutoTuningCompileFunctionLegacy::AutoTuningCompileFunctionLegacy( ++ std::string Pass) ++ : FunctionPass(AutoTuningCompileFunctionLegacy::ID) { ++ this->Pass = Pass; ++} ++ ++bool AutoTuningCompileFunctionLegacy::runOnFunction(Function &F) { ++ AutoTuningCompileFunction Impl(Pass); ++ return Impl.run(F); ++} ++ ++char AutoTuningCompileFunctionLegacy::ID = 0; ++ ++StringRef AutoTuningCompileFunctionLegacy::getPassName() const { ++ return "AutoTuner Incremental Compilation"; ++} ++ ++INITIALIZE_PASS(AutoTuningCompileFunctionLegacy, "autotuning-compile-function", ++ "AutoTuner Incremental Compilation", false, false) ++ ++// Public interface to the AutoTuningCompile pass ++FunctionPass * ++llvm::createAutoTuningCompileFunctionLegacyPass(std::string Pass) { ++ return new AutoTuningCompileFunctionLegacy(Pass); ++} ++ ++PreservedAnalyses ++AutoTuningCompileFunctionPass::run(Function &F, FunctionAnalysisManager &AM) { ++ AutoTuningCompileFunction Impl(Pass); ++ Impl.run(F); ++ return PreservedAnalyses::all(); ++} ++ ++PreservedAnalyses ++AutoTuningCompileLoopPass::run(Loop &L, LoopAnalysisManager &AM, ++ LoopStandardAnalysisResults &AR, LPMUpdater &U) { ++ AutoTuningCompileFunction Impl(Pass); ++ Function *F = L.getHeader()->getParent(); ++ Impl.run(*F); ++ return PreservedAnalyses::all(); ++} ++ ++AutoTuningOptPassGate 
&llvm::getAutoTuningOptPassGate() { ++ static AutoTuningOptPassGate AutoTuningGate; ++ return AutoTuningGate; ++} ++ ++#endif +diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt +index eb008c15903a..e5a82ea8f923 100644 +--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt ++++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt +@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMScalarOpts + ADCE.cpp + AlignmentFromAssumptions.cpp + AnnotationRemarks.cpp ++ AutoTuningCompile.cpp + BDCE.cpp + CallSiteSplitting.cpp + ConstantHoisting.cpp +@@ -92,6 +93,7 @@ add_llvm_component_library(LLVMScalarOpts + LINK_COMPONENTS + AggressiveInstCombine + Analysis ++ AutoTuner + Core + InstCombine + Support +diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +index 335b489d3cb2..feb8932eaae7 100644 +--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp ++++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +@@ -66,6 +66,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + using namespace llvm; + +@@ -173,6 +176,10 @@ static cl::opt + cl::desc("Default threshold (max size of unrolled " + "loop), used in all but O3 optimizations")); + ++#if defined(ENABLE_AUTOTUNER) ++static const std::string UnrollCountParamStr = "UnrollCount"; ++#endif ++ + /// A magic value for use with the Threshold parameter to indicate + /// that the loop unroll should be performed regardless of how much + /// code expansion would result. 
+@@ -893,7 +900,12 @@ bool llvm::computeUnrollCount( + OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, + bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, ++#if defined(ENABLE_AUTOTUNER) ++ TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound, ++ unsigned int Invocation) { ++#else + TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { ++#endif + + UnrollCostEstimator UCE(*L, LoopSize); + +@@ -942,6 +954,43 @@ bool llvm::computeUnrollCount( + } + } + ++#if defined(ENABLE_AUTOTUNER) ++ // Priority 2.5 is using Unroll Count set by AutoTuner (if enabled). ++ if (autotuning::Engine.isEnabled()) { ++ // Create a code region for current loop. This code region will be added to ++ // opportunity list once all the relevant information is gathered. ++ autotuning::Engine.initContainer(L, DEBUG_TYPE, ++ L->getHeader()->getParent()->getName(), ++ /* addOpportunity */ false, Invocation); ++ ++ int NewValue = 0; // the int value is set by lookUpParams() ++ bool UnrollCountChanged = L->lookUpParams("UnrollCount", NewValue); ++ ++ if (UnrollCountChanged) { ++ // Setting the UP.Count with the value suggested by AutoTuner. ++ // AutoTuner will use UnrollCount = 0, 1, X, Y, Z in case of dynamic ++ // configuration and UnrollCount = 0, 1, 2, 4, 8 otherwise to find ++ // optimal configuration. Compiler will unroll the loop with suggested ++ // UnrollCount except when UnrollCount = 1 where AutoTuner is suggesting ++ // to try loop peeling. ++ UP.Count = NewValue; ++ UP.AllowExpensiveTripCount = true; ++ UP.Force = true; ++ UP.Runtime = true; ++ if (!UP.AllowRemainder && UP.Count != 1) ++ UP.Count = 0; ++ ++ // Check for Loop Peeling ++ if (UP.Count == 1) { ++ computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold); ++ UP.Runtime = (PP.PeelCount) ? false : UP.Runtime; ++ } ++ ++ return true; ++ } ++ } ++#endif ++ + // 3rd priority is exact full unrolling. 
This will eliminate all copies + // of some exit test. + UP.Count = 0; +@@ -1119,6 +1168,59 @@ bool llvm::computeUnrollCount( + return ExplicitUnroll; + } + ++#if defined(ENABLE_AUTOTUNER) ++// Given UnrollingPreferences count (UPCount) and TripCount for CodeRegion ++// CR, compute the dynamic Unroll values for tuning and add it to CR. ++static void ++computeAutoTunerDynamicUnrollOptions(unsigned UPCount, unsigned TripCount, ++ const autotuning::CodeRegion &CR) { ++ std::vector DynamicTuningOptions; ++ unsigned int PotentialTuningOptions[2]; ++ unsigned int Idx = 0; ++ int Count = -1; ++ unsigned int CurrentOption = 2; ++ unsigned int MaxTuningCount = 64; ++ DynamicTuningOptions.push_back(0); ++ // Add LoopPeeling as an additional option. ++ DynamicTuningOptions.push_back(1); ++ if (!UPCount) { ++ TripCount = (TripCount > MaxTuningCount) ? MaxTuningCount : TripCount; ++ unsigned int Limit = (TripCount == 0) ? 8 : TripCount; ++ DynamicTuningOptions.push_back(TripCount ? TripCount : 8); ++ while (CurrentOption < Limit) { ++ PotentialTuningOptions[Idx] = CurrentOption; ++ CurrentOption *= 2; ++ Idx = (Idx + 1) % 2; ++ ++Count; ++ } ++ } else { ++ while (CurrentOption < UPCount) { ++ PotentialTuningOptions[Idx] = CurrentOption; ++ CurrentOption *= 2; ++ Idx = (Idx + 1) % 2; ++ ++Count; ++ } ++ if (TripCount != UPCount) { ++ if (CurrentOption == UPCount) { ++ CurrentOption *= 2; ++ } ++ if (!TripCount || CurrentOption < TripCount) { ++ PotentialTuningOptions[Idx] = CurrentOption; ++ ++Count; ++ } ++ } ++ if (UPCount != 1) ++ DynamicTuningOptions.push_back(UPCount); ++ } ++ ++ Count = std::min(1, Count); ++ while (Count >= 0) ++ DynamicTuningOptions.push_back(PotentialTuningOptions[Count--]); ++ ++ CR.addAutoTunerOptions("UnrollCount", DynamicTuningOptions); ++} ++#endif ++ + static LoopUnrollResult + tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + const TargetTransformInfo &TTI, AssumptionCache &AC, +@@ -1132,7 +1234,12 @@ 
tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + std::optional ProvidedUpperBound, + std::optional ProvidedAllowPeeling, + std::optional ProvidedAllowProfileBasedPeeling, ++#if defined(ENABLE_AUTOTUNER) ++ std::optional ProvidedFullUnrollMaxCount, ++ unsigned int Invocation = 0) { ++#else + std::optional ProvidedFullUnrollMaxCount) { ++#endif + + LLVM_DEBUG(dbgs() << "Loop Unroll: F[" + << L->getHeader()->getParent()->getName() << "] Loop %" +@@ -1276,11 +1383,28 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + // computeUnrollCount() decides whether it is beneficial to use upper bound to + // fully unroll the loop. + bool UseUpperBound = false; ++ ++#if defined(ENABLE_AUTOTUNER) ++ bool IsCountSetExplicitly = computeUnrollCount( ++ L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, ++ MaxOrZero, TripMultiple, LoopSize, UP, PP, UseUpperBound, Invocation); ++ const autotuning::CodeRegion CR = L->getCodeRegion(); ++ // computeAutoTunerDynamicUnrollOptions() adds the dynamic Unroll values to ++ // the CodeRegion. ++ computeAutoTunerDynamicUnrollOptions(UP.Count, TripCount, CR); ++ ++ if (!UP.Count) { ++ autotuning::Engine.addOpportunity( ++ CR, {{UnrollCountParamStr, std::to_string(UP.Count)}}); ++ return LoopUnrollResult::Unmodified; ++ } ++#else + bool IsCountSetExplicitly = computeUnrollCount( + L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, + TripMultiple, LoopSize, UP, PP, UseUpperBound); + if (!UP.Count) + return LoopUnrollResult::Unmodified; ++#endif + + if (PP.PeelCount) { + assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step"); +@@ -1300,8 +1424,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + // we had, so we don't want to unroll or peel again. 
+ if (PP.PeelProfiledIterations) + L->setLoopAlreadyUnrolled(); ++#if defined(ENABLE_AUTOTUNER) ++ autotuning::Engine.addOpportunity( ++ CR, {{UnrollCountParamStr, std::to_string(UP.Count)}}); ++ return LoopUnrollResult::PartiallyUnrolled; ++ } ++ autotuning::Engine.addOpportunity(CR, {{UnrollCountParamStr, "0"}}); ++#else + return LoopUnrollResult::PartiallyUnrolled; + } ++#endif + return LoopUnrollResult::Unmodified; + } + +@@ -1329,8 +1461,18 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, + {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, + UP.UnrollRemainder, ForgetAllSCEV}, + LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); ++ ++#if defined(ENABLE_AUTOTUNER) ++ if (UnrollResult == LoopUnrollResult::Unmodified) { ++ autotuning::Engine.addOpportunity(CR, {{UnrollCountParamStr, "0"}}); ++ return LoopUnrollResult::Unmodified; ++ } ++ autotuning::Engine.addOpportunity( ++ CR, {{UnrollCountParamStr, std::to_string(UP.Count)}}); ++#else + if (UnrollResult == LoopUnrollResult::Unmodified) + return LoopUnrollResult::Unmodified; ++#endif + + if (RemainderLoop) { + std::optional RemainderLoopID = +@@ -1379,6 +1521,20 @@ public: + /// Otherwise, forgetAllLoops and rebuild when needed next. + bool ForgetAllSCEV; + ++#if defined(ENABLE_AUTOTUNER) ++private: ++ // 'InvocationCounter' keeps track of Invocation of Loop Unroll Pass and ++ // assign it to 'Invocation'. So each LoopUnroll Object knows when it is ++ // being invoked during optimization pipeline. It is used to identify the ++ // Invocation of a pass if it is invoked multiple times. AutoTuner will use ++ // this information to generate the Code Regions and apply the suggested ++ // configuration during the correct invocation of the Loop Unroll Pass. 
++ static unsigned int InvocationCounter; ++ unsigned int Invocation; ++ ++public: ++#endif ++ + std::optional ProvidedCount; + std::optional ProvidedThreshold; + std::optional ProvidedAllowPartial; +@@ -1405,6 +1561,9 @@ public: + ProvidedAllowPeeling(AllowPeeling), + ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling), + ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) { ++#if defined(ENABLE_AUTOTUNER) ++ Invocation = InvocationCounter++; ++#endif + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + +@@ -1431,7 +1590,12 @@ public: + /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound, ProvidedAllowPeeling, ++#if defined(ENABLE_AUTOTUNER) ++ ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount, ++ Invocation); ++#else + ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount); ++#endif + + if (Result == LoopUnrollResult::FullyUnrolled) + LPM.markLoopAsDeleted(*L); +@@ -1449,6 +1613,9 @@ public: + getLoopAnalysisUsage(AU); + } + }; ++#if defined(ENABLE_AUTOTUNER) ++unsigned int LoopUnroll::InvocationCounter = 0; ++#endif + + } // end anonymous namespace + +@@ -1496,6 +1663,11 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + + std::string LoopName = std::string(L.getName()); + ++#if defined(ENABLE_AUTOTUNER) ++ // LoopFullUnrollPass will be invoked first during optimization pipeline. 
++ unsigned int Invocation = 0; ++#endif ++ + bool Changed = + tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, +@@ -1505,7 +1677,12 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ true, + /*AllowProfileBasedPeeling*/ false, ++#if defined(ENABLE_AUTOTUNER) ++ /*FullUnrollMaxCount*/ std::nullopt, ++ /*Invocation*/ Invocation) != ++#else + /*FullUnrollMaxCount*/ std::nullopt) != ++#endif + LoopUnrollResult::Unmodified; + if (!Changed) + return PreservedAnalyses::all(); +@@ -1588,6 +1765,11 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, + + bool Changed = false; + ++#if defined(ENABLE_AUTOTUNER) ++ // LoopUnrollPass will be invoked second during optimization pipeline. ++ unsigned int Invocation = 1; ++#endif ++ + // The unroller requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroller +@@ -1630,7 +1812,12 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, + /*Count*/ std::nullopt, + /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial, + UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling, ++#if defined(ENABLE_AUTOTUNER) ++ UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount, ++ Invocation); ++#else + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); ++#endif + Changed |= Result != LoopUnrollResult::Unmodified; + + // The parent must not be damaged by unrolling! 
+diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp +index 37b032e4d7c7..4b140e8d600b 100644 +--- a/llvm/lib/Transforms/Scalar/Scalar.cpp ++++ b/llvm/lib/Transforms/Scalar/Scalar.cpp +@@ -64,4 +64,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { + initializeStraightLineStrengthReduceLegacyPassPass(Registry); + initializePlaceBackedgeSafepointsLegacyPassPass(Registry); + initializeLoopSimplifyCFGLegacyPassPass(Registry); ++#if defined(ENABLE_AUTOTUNER) ++ initializeAutoTuningCompileFunctionLegacyPass(Registry); ++ initializeAutoTuningCompileModuleLegacyPass(Registry); ++#endif + } +diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp +index 8b99f73b850b..b3c60686e252 100644 +--- a/llvm/lib/Transforms/Scalar/Sink.cpp ++++ b/llvm/lib/Transforms/Scalar/Sink.cpp +@@ -248,6 +248,11 @@ namespace { + } + + bool runOnFunction(Function &F) override { ++#if defined(ENABLE_AUTOTUNER) ++ if (skipFunction(F)) ++ return false; ++#endif ++ + auto &DT = getAnalysis().getDomTree(); + auto &LI = getAnalysis().getLoopInfo(); + auto &AA = getAnalysis().getAAResults(); +diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt +index a870071f3f64..8616e7b923c0 100644 +--- a/llvm/lib/Transforms/Utils/CMakeLists.txt ++++ b/llvm/lib/Transforms/Utils/CMakeLists.txt +@@ -93,6 +93,7 @@ add_llvm_component_library(LLVMTransformUtils + + LINK_COMPONENTS + Analysis ++ AutoTuner + Core + Support + TargetParser +diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp +index c36b0533580b..20a4edcb29db 100644 +--- a/llvm/lib/Transforms/Utils/LCSSA.cpp ++++ b/llvm/lib/Transforms/Utils/LCSSA.cpp +@@ -491,6 +491,11 @@ char &llvm::LCSSAID = LCSSAWrapperPass::ID; + + /// Transform \p F into loop-closed SSA form. 
+ bool LCSSAWrapperPass::runOnFunction(Function &F) { ++#if defined(ENABLE_AUTOTUNER) ++ if (skipFunction(F)) ++ return false; ++#endif ++ + LI = &getAnalysis().getLoopInfo(); + DT = &getAnalysis().getDomTree(); + auto *SEWP = getAnalysisIfAvailable(); +diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp +index 3e604fdf2e11..2e42e7f1397f 100644 +--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp ++++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp +@@ -69,6 +69,9 @@ + #include "llvm/Transforms/Utils/BasicBlockUtils.h" + #include "llvm/Transforms/Utils/Local.h" + #include "llvm/Transforms/Utils/LoopUtils.h" ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + using namespace llvm; + + #define DEBUG_TYPE "loop-simplify" +@@ -793,6 +796,11 @@ Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); } + /// it in any convenient order) inserting preheaders... + /// + bool LoopSimplify::runOnFunction(Function &F) { ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled() && skipFunction(F)) ++ return false; ++#endif ++ + bool Changed = false; + LoopInfo *LI = &getAnalysis().getLoopInfo(); + DominatorTree *DT = &getAnalysis().getDomTree(); +diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp +index 511dd61308f9..2d2c3e50514b 100644 +--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp ++++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp +@@ -69,6 +69,9 @@ + #include + #include + #include ++#if defined(ENABLE_AUTOTUNER) ++#include "llvm/AutoTuner/AutoTuning.h" ++#endif + + namespace llvm { + class DataLayout; +diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt +index 998dfd956575..f2c5c04abb13 100644 +--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt ++++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt +@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMVectorize + + LINK_COMPONENTS + Analysis 
++ AutoTuner + Core + Support + TransformUtils +diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +index f923f0be6621..f13ce6853666 100644 +--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp ++++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +@@ -113,6 +113,18 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, + // Populate values with existing loop metadata. + getHintsFromMetadata(); + ++#if defined(ENABLE_AUTOTUNER) ++ if (autotuning::Engine.isEnabled()) { ++ int NewValue = 0; ++ bool VectorizationInterleaveChanged = ++ L->lookUpParams("VectorizationInterleave", NewValue); ++ ++ if (VectorizationInterleaveChanged) { ++ Interleave.Value = NewValue; ++ } ++ } ++#endif ++ + // force-vector-interleave overrides DisableInterleaving. + if (VectorizerParams::isInterleaveForced()) + Interleave.Value = VectorizerParams::VectorizationInterleave; +diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +index b603bbe55dc9..46fab860f5a3 100644 +--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp ++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +@@ -10178,6 +10178,22 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) + VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || + !EnableLoopVectorization) {} + ++#if defined(ENABLE_AUTOTUNER) ++// Given the iterleave count (IC) and CR, compute the dynamic values for ++// interleave count. Then add it to CR. 
++static void ++computeAutoTunerDynamicInterleaveOptions(unsigned IC, ++ const autotuning::CodeRegion &CR) { ++ ++ std::vector AutoTunerOptions{1, 2, 4}; ++ if (std::find(AutoTunerOptions.begin(), AutoTunerOptions.end(), IC) == ++ AutoTunerOptions.end()) ++ AutoTunerOptions[2] = IC; ++ ++ CR.addAutoTunerOptions("VectorizationInterleave", AutoTunerOptions); ++} ++#endif ++ + bool LoopVectorizePass::processLoop(Loop *L) { + assert((EnableVPlanNativePath || L->isInnermost()) && + "VPlan-native path is not enabled. Only process inner loops."); +@@ -10190,6 +10206,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { + << L->getHeader()->getParent()->getName() << "' from " + << DebugLocStr << "\n"); + ++#if defined(ENABLE_AUTOTUNER) ++ // Initialize the loop for auto-tuning but do not add it ++ // as an tuning opportunity yet. ++ autotuning::Engine.initContainer( ++ L, LV_NAME, L->getHeader()->getParent()->getName(), false); ++#endif + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); + + LLVM_DEBUG( +@@ -10422,6 +10444,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { + InterleaveLoop = false; + } + ++#if defined(ENABLE_AUTOTUNER) ++ if (!VectorizerParams::isInterleaveForced()) { ++ // Compute the dynamic values for VectorizationInterleave and add it to the ++ // CodeRegion. ++ computeAutoTunerDynamicInterleaveOptions(IC, L->getCodeRegion()); ++ ++ // Add the current loop as a tuning opportunity explicitly. ++ autotuning::Engine.addOpportunity( ++ L->getCodeRegion(), {{"VectorizationInterleave", std::to_string(IC)}}); ++ } ++#endif ++ + // Override IC if user provided an interleave count. + IC = UserIC > 0 ? 
UserIC : IC; + +diff --git a/llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml b/llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml +new file mode 100644 +index 000000000000..f483a269906a +--- /dev/null ++++ b/llvm/test/AutoTuning/AutotuningDump/Inputs/unroll_template.yaml +@@ -0,0 +1,8 @@ ++--- !AutoTuning ++Pass: loop-unroll ++Name: [name] ++Function: foo ++CodeRegionType: loop ++Args: ++ - UnrollCount: [number] ++... +diff --git a/llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll b/llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll +new file mode 100644 +index 000000000000..ceb9b4fb2ca6 +--- /dev/null ++++ b/llvm/test/AutoTuning/AutotuningDump/create-data-dir.ll +@@ -0,0 +1,65 @@ ++; UNSUPPORTED: windows ++; RUN: sed 's#\[number\]#0#g; s#\[name\]#for.body#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.DEFAULT.yaml ++; RUN: opt --disable-output %s -S -passes='require' \ ++; RUN: -auto-tuning-input=%t.DEFAULT.yaml -auto-tuning-config-id=1 ++; RUN: cat %T/../autotune_datadir/create-data-dir.ll/1.ll | FileCheck %s ++; RUN: rm -rf %T/../autotune_datadir/* ++ ++; RUN: cp %t.DEFAULT.yaml %T/../autotune_datadir/config.yaml ++; RUN: opt %s -S -passes='require' -auto-tuning-config-id=1 ++; RUN: cat %T/../autotune_datadir/create-data-dir.ll/1.ll | FileCheck %s ++; RUN: rm -rf %T/../autotune_datadir/* ++ ++; RUN: cp %t.DEFAULT.yaml %T/../autotune_datadir/config.yaml ++; RUN: opt %s -S -passes='require' -enable-autotuning-dump ++; RUN: echo -n %T/../autotune_datadir/IR_files/ > %t.filename ++; RUN: echo -n "create-data-dir.ll/" >> %t.filename ++; RUN: echo -n %s | sed 's#/#_#g' >> %t.filename ++; RUN: echo -n ".ll" >> %t.filename ++; RUN: cat %t.filename | xargs cat | FileCheck %s ++; RUN: rm -rf %T/../autotune_datadir ++ ++; ModuleID = 'search.c' ++source_filename = "search.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: argmemonly 
nofree norecurse nosync nounwind readonly uwtable ++define dso_local i32 @search(ptr nocapture noundef readonly %Arr, i32 noundef %Value, i32 noundef %Size) { ++entry: ++ %cmp5 = icmp sgt i32 %Size, 0 ++ br i1 %cmp5, label %for.body.preheader, label %for.end ++ ++for.body.preheader: ; preds = %entry ++ %wide.trip.count = zext i32 %Size to i64 ++ br label %for.body ++ ++for.body: ; preds = %for.body.preheader, %for.inc ++ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ++ %arrayidx = getelementptr inbounds i32, ptr %Arr, i64 %indvars.iv ++ %0 = load i32, ptr %arrayidx, align 4 ++ %cmp1 = icmp eq i32 %0, %Value ++ br i1 %cmp1, label %for.end.loopexit.split.loop.exit, label %for.inc ++ ++for.inc: ; preds = %for.body ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ++ br i1 %exitcond.not, label %for.end, label %for.body ++ ++for.end.loopexit.split.loop.exit: ; preds = %for.body ++ %1 = trunc i64 %indvars.iv to i32 ++ br label %for.end ++ ++for.end: ; preds = %for.inc, %for.end.loopexit.split.loop.exit, %entry ++ %Idx.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.end.loopexit.split.loop.exit ], [ %Size, %for.inc ] ++ ret i32 %Idx.0.lcssa ++} ++ ++; Check that only loop body is inside the IR File. 
++; CHECK-LABEL: for.body: ; preds = ++; CHECK-NEXT: %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] ++; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Arr, i64 %indvars.iv ++; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ++; CHECK-NEXT: %cmp1 = icmp eq i32 %0, %Value ++; CHECK-NEXT: br i1 %cmp1, label %for.end.loopexit.split.loop.exit, label %for.inc +diff --git a/llvm/test/AutoTuning/AutotuningDump/unroll.ll b/llvm/test/AutoTuning/AutotuningDump/unroll.ll +new file mode 100644 +index 000000000000..e8243da55fff +--- /dev/null ++++ b/llvm/test/AutoTuning/AutotuningDump/unroll.ll +@@ -0,0 +1,35 @@ ++; RUN: rm -rf %T.tmp/Output ++; RUN: mkdir -p %T.tmp/Output ++; RUN: rm %t.DEFAULT.yaml -rf ++; RUN: sed 's#\[number\]#0#g; s#\[name\]#for.body#g' %S/Inputs/unroll_template.yaml > %t.DEFAULT.yaml ++; RUN: env AUTOTUNE_DATADIR=%T.tmp/Output opt %s -S -passes='require' \ ++; RUN: -auto-tuning-input=%t.DEFAULT.yaml -auto-tuning-config-id=1 ++; RUN: env AUTOTUNE_DATADIR=%T.tmp/Output opt %s -S -passes='require' \ ++; RUN: -auto-tuning-input=%t.DEFAULT.yaml -auto-tuning-config-id=2 ++; RUN: cat %T.tmp/Output/unroll.ll/1.ll | FileCheck %s -check-prefix=DEFAULT ++; RUN: cat %T.tmp/Output/unroll.ll/2.ll | FileCheck %s -check-prefix=DEFAULT ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++for.end: ; preds = %for.body ++ ret void ++} ++; Check that only loop body is inside the IR File. 
++; DEFAULT-LABEL: for.body: ; preds = %for.body, %entry ++; DEFAULT-NEXT: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++; DEFAULT-NEXT: %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv ++; DEFAULT: %exitcond = icmp eq i64 %indvars.iv.next, 64 ++; DEFAULT: br i1 %exitcond, label %for.end, label %for.body ++ ++; RUN: rm -rf %T.tmp/Output +diff --git a/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml +new file mode 100644 +index 000000000000..a5e669c17a71 +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/baseline_config.yaml +@@ -0,0 +1,9 @@ ++!AutoTuning {Args: [{UnrollCount: 0}], CodeRegionHash: 12835463591102937421, ++ CodeRegionType: loop, Function: test, Invocation: 0, Name: for.body, ++ Pass: loop-unroll} ++--- !AutoTuning {Args: [{VectorizationInterleave: 2}], ++ CodeRegionHash: 12835463591102937421, CodeRegionType: loop, Function: test, ++ Invocation: 0, Name: for.body, Pass: loop-vectorize} ++--- !AutoTuning {Args: [{UnrollCount: 0}], CodeRegionHash: 8430337282115614432, ++ CodeRegionType: loop, Function: test, Invocation: 1, Name: vector.body, ++ Pass: loop-unroll} +diff --git a/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml +new file mode 100644 +index 000000000000..738cf55ffe9a +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/Inputs/autotune_datadir/random_config.yaml +@@ -0,0 +1,9 @@ ++!AutoTuning {Args: [{UnrollCount: 2}], CodeRegionHash: 12835463591102937421, ++ CodeRegionType: loop, Function: test, Invocation: 0, Name: for.body, ++ Pass: loop-unroll} ++--- !AutoTuning {Args: [{VectorizationInterleave: 2}], ++ CodeRegionHash: 12835463591102937421, CodeRegionType: loop, Function: test, ++ Invocation: 0, Name: for.body, Pass: loop-vectorize} ++--- 
!AutoTuning {Args: [{UnrollCount: 0}], CodeRegionHash: 8430337282115614432, ++ CodeRegionType: loop, Function: test, Invocation: 1, Name: vector.body, ++ Pass: loop-unroll} +diff --git a/llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll b/llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll +new file mode 100644 +index 000000000000..667a076b2d23 +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/Inputs/test.ll +@@ -0,0 +1,117 @@ ++; ModuleID = 'test.c' ++source_filename = "test.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++@.str = private unnamed_addr constant [12 x i8] c"tmp <= 10.0\00", align 1 ++@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", align 1 ++@__PRETTY_FUNCTION__.test = private unnamed_addr constant [12 x i8] c"void test()\00", align 1 ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @test() #0 { ++entry: ++ %cs = alloca i32, align 4 ++ %flush = alloca ptr, align 8 ++ %i = alloca i32, align 4 ++ %tmp = alloca double, align 8 ++ call void @llvm.lifetime.start.p0(i64 4, ptr %cs) #5 ++ store i32 16431360, ptr %cs, align 4, !tbaa !6 ++ call void @llvm.lifetime.start.p0(i64 8, ptr %flush) #5 ++ %0 = load i32, ptr %cs, align 4, !tbaa !6 ++ %conv = sext i32 %0 to i64 ++ %call = call noalias ptr @calloc(i64 noundef %conv, i64 noundef 8) #6 ++ store ptr %call, ptr %flush, align 8, !tbaa !10 ++ call void @llvm.lifetime.start.p0(i64 4, ptr %i) #5 ++ call void @llvm.lifetime.start.p0(i64 8, ptr %tmp) #5 ++ store double 0.000000e+00, ptr %tmp, align 8, !tbaa !12 ++ store i32 0, ptr %i, align 4, !tbaa !6 ++ br label %for.cond ++ ++for.cond: ; preds = %for.inc, %entry ++ %1 = load i32, ptr %i, align 4, !tbaa !6 ++ %2 = load i32, ptr %cs, align 4, !tbaa !6 ++ %cmp = icmp slt i32 %1, %2 ++ br i1 %cmp, label %for.body, label %for.end ++ ++for.body: ; preds = %for.cond ++ %3 = load ptr, ptr %flush, align 8, !tbaa !10 ++ %4 = load i32, ptr %i, align 4, 
!tbaa !6 ++ %idxprom = sext i32 %4 to i64 ++ %arrayidx = getelementptr inbounds double, ptr %3, i64 %idxprom ++ %5 = load double, ptr %arrayidx, align 8, !tbaa !12 ++ %6 = load double, ptr %tmp, align 8, !tbaa !12 ++ %add = fadd double %6, %5 ++ store double %add, ptr %tmp, align 8, !tbaa !12 ++ br label %for.inc ++ ++for.inc: ; preds = %for.body ++ %7 = load i32, ptr %i, align 4, !tbaa !6 ++ %inc = add nsw i32 %7, 1 ++ store i32 %inc, ptr %i, align 4, !tbaa !6 ++ br label %for.cond, !llvm.loop !14 ++ ++for.end: ; preds = %for.cond ++ %8 = load double, ptr %tmp, align 8, !tbaa !12 ++ %cmp2 = fcmp ole double %8, 1.000000e+01 ++ br i1 %cmp2, label %if.then, label %if.else ++ ++if.then: ; preds = %for.end ++ br label %if.end ++ ++if.else: ; preds = %for.end ++ call void @__assert_fail(ptr noundef @.str, ptr noundef @.str.1, i32 noundef 11, ptr noundef @__PRETTY_FUNCTION__.test) #7 ++ unreachable ++ ++if.end: ; preds = %if.then ++ %9 = load ptr, ptr %flush, align 8, !tbaa !10 ++ call void @free(ptr noundef %9) #5 ++ call void @llvm.lifetime.end.p0(i64 8, ptr %tmp) #5 ++ call void @llvm.lifetime.end.p0(i64 4, ptr %i) #5 ++ call void @llvm.lifetime.end.p0(i64 8, ptr %flush) #5 ++ call void @llvm.lifetime.end.p0(i64 4, ptr %cs) #5 ++ ret void ++} ++ ++; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) ++declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 ++ ++; Function Attrs: nounwind allocsize(0,1) ++declare noalias ptr @calloc(i64 noundef, i64 noundef) #2 ++ ++; Function Attrs: noreturn nounwind ++declare void @__assert_fail(ptr noundef, ptr noundef, i32 noundef, ptr noundef) #3 ++ ++; Function Attrs: nounwind ++declare void @free(ptr noundef) #4 ++ ++; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) ++declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 ++ ++attributes #0 = { nounwind uwtable "frame-pointer"="non-leaf" "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } ++attributes #2 = { nounwind allocsize(0,1) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #3 = { noreturn nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #4 = { nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,-fmv" } ++attributes #5 = { nounwind } ++attributes #6 = { nounwind allocsize(0,1) } ++attributes #7 = { noreturn nounwind } ++ ++!llvm.module.flags = !{!0, !1, !2, !3, !4} ++!llvm.ident = !{!5} ++ ++!0 = !{i32 1, !"wchar_size", i32 4} ++!1 = !{i32 8, !"PIC Level", i32 2} ++!2 = !{i32 7, !"PIE Level", i32 2} ++!3 = !{i32 7, !"uwtable", i32 2} ++!4 = !{i32 7, !"frame-pointer", i32 1} ++!5 = !{!"Huawei BiSheng Compiler clang version 18.0.0 (ssh://git@codehub-dg-y.huawei.com:2222/CompilerKernel/BiShengKernel/BiSheng.git 026024071a7fb66b26b65fb81da702cc5f0cf405)"} ++!6 = !{!7, !7, i64 0} ++!7 = !{!"int", !8, i64 0} ++!8 = !{!"omnipotent char", !9, i64 0} ++!9 = !{!"Simple C/C++ TBAA"} ++!10 = !{!11, !11, i64 0} ++!11 = !{!"any pointer", !8, i64 0} ++!12 = !{!13, !13, i64 0} ++!13 = !{!"double", !8, i64 0} ++!14 = distinct !{!14, !15} ++!15 = !{!"llvm.loop.mustprogress"} +diff --git a/llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll b/llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll +new file mode 100644 +index 000000000000..f905208a2f3b +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/apply_baseline_config.ll 
+@@ -0,0 +1,11 @@ ++; The purpose is to test the baseline IR is the same as the 1st iteration of ++; autotuning process with --use-baseline-config enabled. ++; RUN: rm %t.baseline %t.firstIt -f ++; RUN: opt -O3 %S/Inputs/test.ll -o %t.baseline ++; RUN: opt -O3 %S/Inputs/test.ll -o %t.firstIt_baseline \ ++; RUN: -auto-tuning-input=%S/Inputs/autotune_datadir/baseline_config.yaml ++; RUN: cmp %t.firstIt_baseline %t.baseline ++ ++; RUN: opt -O3 %S/Inputs/test.ll -o %t.firstIt_random \ ++; RUN: -auto-tuning-input=%S/Inputs/autotune_datadir/random_config.yaml ++; RUN: not cmp %t.firstIt_random %t.baseline +diff --git a/llvm/test/AutoTuning/BaselineConfig/opp.ll b/llvm/test/AutoTuning/BaselineConfig/opp.ll +new file mode 100644 +index 000000000000..b2897316fc22 +--- /dev/null ++++ b/llvm/test/AutoTuning/BaselineConfig/opp.ll +@@ -0,0 +1,67 @@ ++; REQUIRES: asserts ++; RUN: rm %t.callsite_opp -rf ++; RUN: opt %s -O3 -debug-only=inline -disable-output -S 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=DEFAULT ++; RUN: opt %s -O3 -auto-tuning-opp=%t.callsite_opp -disable-output -S 2>&1 ++; RUN: FileCheck %s --input-file %t.callsite_opp/opp.ll.yaml -check-prefix=AUTOTUNE ++ ++@a = global i32 4 ++ ++; Function Attrs: nounwind readnone uwtable ++define i32 @simpleFunction(i32 %a) #0 { ++entry: ++ call void @extern() ++ %a1 = load volatile i32, i32* @a ++ %x1 = add i32 %a1, %a1 ++ %a2 = load volatile i32, i32* @a ++ %x2 = add i32 %x1, %a2 ++ %a3 = load volatile i32, i32* @a ++ %x3 = add i32 %x2, %a3 ++ %a4 = load volatile i32, i32* @a ++ %x4 = add i32 %x3, %a4 ++ %a5 = load volatile i32, i32* @a ++ %x5 = add i32 %x4, %a5 ++ %a6 = load volatile i32, i32* @a ++ %x6 = add i32 %x5, %a6 ++ %a7 = load volatile i32, i32* @a ++ %x7 = add i32 %x6, %a6 ++ %a8 = load volatile i32, i32* @a ++ %x8 = add i32 %x7, %a8 ++ %a9 = load volatile i32, i32* @a ++ %x9 = add i32 %x8, %a9 ++ %a10 = load volatile i32, i32* @a ++ %x10 = add i32 %x9, %a10 ++ %a11 = load volatile i32, i32* @a ++ %x11 = add i32 
%x10, %a11 ++ %a12 = load volatile i32, i32* @a ++ %x12 = add i32 %x11, %a12 ++ %add = add i32 %x12, %a ++ ret i32 %add ++} ++ ++; Function Attrs: nounwind readnone uwtable ++define i32 @bar(i32 %a) #0 { ++entry: ++ %0 = tail call i32 @simpleFunction(i32 6) ++ ret i32 %0 ++} ++ ++declare void @extern() ++ ++attributes #0 = { nounwind readnone uwtable } ++attributes #1 = { nounwind cold readnone uwtable } ++ ++ ++; NOTE: Need to make sure the function inling have the same behaviour as O3 and ++; 'BaselineConfig' ++; DEFAULT: Inlining calls in: bar ++; DEFAULT: Inlining (cost=115, threshold=375), Call: %0 = tail call i32 @simpleFunction(i32 6) ++ ++; AUTOTUNE: Pass: inline ++; AUTOTUNE-NEXT: Name: simpleFunction ++; AUTOTUNE-NEXT: Function: bar ++; AUTOTUNE-NEXT: CodeRegionType: callsite ++; AUTOTUNE-NEXT: CodeRegionHash: {{[0-9]+}} ++; AUTOTUNE-NEXT: DynamicConfigs: { ForceInline: [ 0, 1 ] } ++; AUTOTUNE-NEXT: BaselineConfig: { ForceInline: '1' } ++; AUTOTUNE-NEXT: Invocation: 0 +diff --git a/llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll b/llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll +new file mode 100644 +index 000000000000..13acafae6fc4 +--- /dev/null ++++ b/llvm/test/AutoTuning/CodeRegionFilter/function-filtering.ll +@@ -0,0 +1,62 @@ ++; REQUIRES: asserts ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop --disable-output ++; RUN: FileCheck %s --input-file %t.filter/function-filtering.ll.yaml -check-prefix=DEFAULT ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop \ ++; RUN: -auto-tuning-function-filter=foo --disable-output ++; RUN: FileCheck %s --input-file %t.filter/function-filtering.ll.yaml -check-prefix=FILTER_FOO ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S 
-passes='function(require,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop \ ++; RUN: -auto-tuning-function-filter=bar --disable-output ++; RUN: FileCheck %s --input-file %t.filter/function-filtering.ll.yaml -check-prefix=FILTER_BAR ++ ++; RUN: rm -rf %t.filter ++; RUN: opt %s -S -passes='function(require,loop-unroll),cgscc(inline)' \ ++; RUN: -auto-tuning-opp=%t.filter -auto-tuning-type-filter=CallSite,Loop \ ++; RUN: -auto-tuning-function-filter=dummy -debug-only=autotuning | \ ++; RUN: FileCheck %s -check-prefix=FILTER_DUMMY ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++define void @bar(i32* nocapture %a) { ++entry: ++ call void @foo(i32* %a) ++ ret void ++} ++ ++; DEFAULT: --- !AutoTuning ++; DEFAULT: --- !AutoTuning ++ ++; FILTER_FOO: --- !AutoTuning ++; FILTER_FOO: Function: foo ++; FILTER_FOO-NOT: --- !AutoTuning ++ ++; FILTER_BAR: --- !AutoTuning ++; FILTER_BAR: Function: bar ++; FILTER_BAR-NOT: --- !AutoTuning ++ ++; FILTER_DUMMY-NOT: --- !AutoTuning ++; FILTER_DUMMY-NOT: --- !AutoTuning +diff --git a/llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml b/llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml +new file mode 100644 +index 000000000000..9c203e58f0ab +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/Inputs/invalid-format.yaml +@@ -0,0 +1,3 @@ ++ ++ this is a xml file ++ +diff --git a/llvm/test/AutoTuning/Error/Inputs/template.yaml 
b/llvm/test/AutoTuning/Error/Inputs/template.yaml +new file mode 100644 +index 000000000000..1f02b52ffb38 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/Inputs/template.yaml +@@ -0,0 +1,10 @@ ++--- !AutoTuning ++Pass: pass ++Name: for.body ++Function: foo ++CodeRegionType: loop ++CodeRegionHash: 0 ++Args: ++ - UnrollCount: 2 ++ - PassOrder: [test, test2] ++... +diff --git a/llvm/test/AutoTuning/Error/file-not-found-error.ll b/llvm/test/AutoTuning/Error/file-not-found-error.ll +new file mode 100644 +index 000000000000..6a364239a271 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/file-not-found-error.ll +@@ -0,0 +1,29 @@ ++; RUN: rm %t.non-existing.yaml -rf ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.non-existing.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error massage is shown properly when input yaml is not found ++; ++; ERROR: Error parsing auto-tuning input. 
++; ERROR: No such file or directory +diff --git a/llvm/test/AutoTuning/Error/invalid-yaml-error.ll b/llvm/test/AutoTuning/Error/invalid-yaml-error.ll +new file mode 100644 +index 000000000000..bfc8784c4ea4 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/invalid-yaml-error.ll +@@ -0,0 +1,27 @@ ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%S/Inputs/invalid-format.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error massage is shown properly when input yaml is in invalid format ++; ++; ERROR: error: YAML:1:1: error: document root is not of mapping type. +diff --git a/llvm/test/AutoTuning/Error/malformed-input-error.ll b/llvm/test/AutoTuning/Error/malformed-input-error.ll +new file mode 100644 +index 000000000000..0b73c3195503 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/malformed-input-error.ll +@@ -0,0 +1,136 @@ ++; Check if error messages are shown properly for malformed YAML files. 
++ ++; Missing Pass Field ++; RUN: rm %t.missing-pass.yaml -rf ++; RUN: sed 's#Pass: pass##g' %S/Inputs/template.yaml > %t.missing-pass.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-pass.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-FIELD ++ ++; Missing Pass Value ++; RUN: rm %t.missing-value-pass.yaml -rf ++; RUN: sed 's#pass##g' %S/Inputs/template.yaml > %t.missing-value-pass.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-pass.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-PASS-VALUE ++ ++; Missing Name Field ++; RUN: rm %t.missing-name.yaml -rf ++; RUN: sed 's#Name: for.body##g' %S/Inputs/template.yaml > %t.missing-name.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-name.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-NAME-FIELD ++ ++; Missing Name Value ++; RUN: rm %t.missing-value-name.yaml -rf ++; RUN: sed 's#for.body##g' %S/Inputs/template.yaml > %t.missing-value-name.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-name.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-NAME-VALUE ++ ++; Missing Function Field ++; RUN: rm %t.missing-function.yaml -rf ++; RUN: sed 's#Function: foo##g' %S/Inputs/template.yaml > %t.missing-function.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' -auto-tuning-input=%t.missing-function.yaml 2>&1 | FileCheck %s -check-prefix=ERROR-FUNCTION-FIELD ++ ++; Missing Function Value ++; RUN: rm %t.missing-value-func.yaml -rf ++; RUN: sed 's#foo##g' %S/Inputs/template.yaml > %t.missing-value-func.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-func.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-FUNC-VALUE ++ ++; Missing CodeRegionType Field ++; RUN: rm 
%t.missing-type.yaml -rf ++; RUN: sed 's#CodeRegionType: loop##g' %S/Inputs/template.yaml > %t.missing-type.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-type.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-CODE-REGION-TYPE-FIELD ++ ++; Missing CodeRegionType Value ++; RUN: rm %t.missing-value-type.yaml -rf ++; RUN: sed 's#loop##g' %S/Inputs/template.yaml > %t.missing-value-type.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-type.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-CODE-REGION-TYPE-VALUE ++ ++; Invalid CodeRegionType Value ++; RUN: rm %t.invalid-value-type.yaml -rf ++; RUN: sed 's#loop#error-type#g' %S/Inputs/template.yaml > %t.invalid-value-type.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.invalid-value-type.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-CODE-REGION-TYPE-INVALID ++ ++; Missing Param Name ++; RUN: rm %t.missing-param-name.yaml -rf ++; RUN: sed 's#UnrollCount##g' %S/Inputs/template.yaml > %t.missing-param-name.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-param-name.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-PARAM-NAME ++ ++; Missing Param Value ++; RUN: rm %t.missing-value-param.yaml -rf ++; RUN: sed 's#2##g' %S/Inputs/template.yaml > %t.missing-value-param.yaml ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.missing-value-param.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=ERROR-PARAM-VALUE ++ ++; Empty Param List ++; RUN: rm %t.empty-value-param-list.yaml -rf ++; RUN: sed 's#\[test, test2\]#\[\]#g' %S/Inputs/template.yaml > %t.empty-value-param-list.yaml ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.empty-value-param-list.yaml 2>&1 | \ ++; RUN: 
FileCheck %s -check-prefix=VALID ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error massage is shown properly for malformed YAML input files. ++; ++ ++; ERROR-FIELD: error: CodeRegionHash, CodeRegionType, or Pass missing. ++ ++; ERROR-NAME-FIELD: error: Remark Name expected; enable -autotuning-omit-metadata. ++ ++; ERROR-FUNCTION-FIELD: error: Remark Function Name expected; enable -autotuning-omit-metadata. ++ ++; ERROR-PASS-VALUE: error: YAML:2:1: error: expected a value of scalar type. ++; ERROR-PASS-VALUE: Pass: ++ ++; ERROR-NAME-VALUE: error: YAML:3:1: error: expected a value of scalar type. ++; ERROR-NAME-VALUE: Name: ++ ++; ERROR-FUNC-VALUE: error: YAML:4:1: error: expected a value of scalar type. ++; ERROR-FUNC-VALUE: Function: ++ ++; ERROR-CODE-REGION-TYPE-FIELD: CodeRegionHash, CodeRegionType, or Pass missing. ++ ++; ERROR-CODE-REGION-TYPE-VALUE: error: YAML:5:1: error: expected a value of scalar type. ++; ERROR-CODE-REGION-TYPE-VALUE: CodeRegionType: ++ ++; ERROR-CODE-REGION-TYPE-INVALID: Unsupported CodeRegionType:error-type ++ ++; ERROR-PARAM-NAME: error: YAML:8:5: error: argument key is missing. ++; ERROR-PARAM-NAME: - : 2 ++ ++; ERROR-PARAM-VALUE: error: YAML:8:5: error: expected a value of scalar type. ++; ERROR-PARAM-VALUE: - UnrollCount: ++ ++; VALID-NOT: -auto-tuning-input=(input file) option failed. 
+diff --git a/llvm/test/AutoTuning/Error/output-error.ll b/llvm/test/AutoTuning/Error/output-error.ll +new file mode 100644 +index 000000000000..61ffba50924b +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/output-error.ll +@@ -0,0 +1,28 @@ ++; RUN: rm %t.opp -rf; touch %t.opp ++; RUN: not opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-opp=%t.opp 2>&1 | FileCheck %s -check-prefix=ERROR-OPP ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error massage is shown properly when output files cannot be created ++; ++; ERROR-OPP: Error generating auto-tuning opportunities. 
++; ERROR-OPP: error: Not a directory +diff --git a/llvm/test/AutoTuning/Error/valid-input.ll b/llvm/test/AutoTuning/Error/valid-input.ll +new file mode 100644 +index 000000000000..dae90cdbe408 +--- /dev/null ++++ b/llvm/test/AutoTuning/Error/valid-input.ll +@@ -0,0 +1,27 @@ ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%S/Inputs/template.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=VALID ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; check if error massage is shown properly when the input is valid ++; ++ ++; VALID-NOT: -auto-tuning-input=(input file) option failed. +diff --git a/llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml b/llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml +new file mode 100644 +index 000000000000..a7d390be63e7 +--- /dev/null ++++ b/llvm/test/AutoTuning/IncrementalCompilation/Inputs/template.yaml +@@ -0,0 +1,9 @@ ++--- !AutoTuning ++Pass: [dummy-pass] ++CodeRegionType: [dummy-type] ++Name: foo ++DebugLoc: { File: [dummy-file], Line: 0, Column: 0 } ++Function: foo ++CodeRegionHash: 0 ++Invocation: 0 ++... 
+diff --git a/llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll b/llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll +new file mode 100644 +index 000000000000..b9dc81089d40 +--- /dev/null ++++ b/llvm/test/AutoTuning/IncrementalCompilation/inc-compile-parse-input.ll +@@ -0,0 +1,103 @@ ++; REQUIRES: asserts ++; RUN: rm %t.output -rf ++; RUN: rm %t.inc_compile.yaml -rf ++; RUN: sed 's#\[dummy-pass\]#inline#g' %S/Inputs/template.yaml > %t.temp.yaml ++; RUN: sed 's#\[dummy-type\]#callsite#g' %t.temp.yaml > %t.temp2.yaml ++; RUN: sed 's#\[dummy-file\]#%s#g' %t.temp2.yaml > %t.inc_compile.yaml ++; RUN: opt -O3 %s -auto-tuning-input=%t.inc_compile.yaml \ ++; RUN: -auto-tuning-compile-mode=CoarseGrain -print-after-all \ ++; RUN: -debug-only=autotuning-compile \ ++; RUN: -o %t.output 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=COARSEGRAIN ++ ++; RUN: rm %t.output -rf ++; RUN: rm %t.inc_compile.yaml -rf ++; RUN: sed 's#\[dummy-pass\]#inline#g' %S/Inputs/template.yaml > %t.temp.yaml ++; RUN: sed 's#\[dummy-type\]#callsite#g' %t.temp.yaml > %t.temp2.yaml ++; RUN: sed 's#\[dummy-file\]#%s#g' %t.temp2.yaml > %t.inc_compile.yaml ++; RUN: opt -O3 %s -auto-tuning-input=%t.inc_compile.yaml \ ++; RUN: -auto-tuning-compile-mode=FineGrain -print-after-all \ ++; RUN: -debug-only=autotuning-compile \ ++; RUN: -o %t.output 2>&1 | \ ++; RUN: FileCheck %s -check-prefixes=FINEGRAIN-1,FINEGRAIN-INLINE ++ ++; RUN: rm %t.output -rf ++; RUN: rm %t.inc_compile.yaml -rf ++; RUN: sed 's#\[dummy-pass\]#loop-unroll#g' %S/Inputs/template.yaml > %t.temp.yaml ++; RUN: sed 's#\[dummy-type\]#loop#g' %t.temp.yaml > %t.temp2.yaml ++; RUN: sed 's#\[dummy-file\]#%s#g' %t.temp2.yaml > %t.inc_compile.yaml ++; RUN: opt -O3 %s -auto-tuning-input=%t.inc_compile.yaml \ ++; RUN: -auto-tuning-compile-mode=FineGrain -print-after-all \ ++; RUN: -debug-only=autotuning-compile \ ++; RUN: -o %t.output 2>&1 | \ ++; RUN: FileCheck %s 
-check-prefixes=FINEGRAIN-1,FINEGRAIN-2,FINEGRAIN-UNROLL ++ ++; ModuleID = 'test.c' ++source_filename = "test.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable ++define dso_local i32 @test(i32* nocapture noundef %a, i32* nocapture noundef readonly %b, i32 noundef %size) local_unnamed_addr #0 { ++entry: ++ %cmp11 = icmp sgt i32 %size, 0 ++ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup ++ ++for.body.preheader: ; preds = %entry ++ %wide.trip.count = zext i32 %size to i64 ++ br label %for.body ++ ++for.cond.cleanup: ; preds = %for.body, %entry ++ ret i32 undef ++ ++for.body: ; preds = %for.body.preheader, %for.body ++ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %1 = load i32, i32* %arrayidx2, align 4 ++ %add = add nsw i32 %1, %0 ++ store i32 %add, i32* %arrayidx2, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count ++ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body ++} ++ ++attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable "frame-pointer"="non-leaf" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8a" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei BiSheng Compiler clang version 12.0.0 (1c7b819ced36)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: 
"test.c", directory: "/home/m00629332/code/autoTuner") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei BiSheng Compiler clang version 12.0.0 (1c7b819ced36)"} ++!10 = distinct !DISubprogram(name: "dummy", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 2, column: 5, scope: !10) ++ ++; COARSEGRAIN: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: start ++; COARSEGRAIN-NEXT: AutoTuningCompile: No change in opt pipeline for Basic/CoarseGrain incremental compilation mode. ++; COARSEGRAIN-NOT: Skip pass {{.*}}: True ++ ++; FINEGRAIN-1: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: start ++; FINEGRAIN-1-NEXT: AutoTuningCompile: SkipPasses enabled. ++; FINEGRAIN-1-NOT: Skip pass {{.*}}: False ++; FINEGRAIN-1: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: inline ++; FINEGRAIN-INLINE: AutoTuningCompile: SkipPasses disabled. ++; FINEGRAIN-INLINE: Skip pass 'InlinerPass': False ++; FINEGRAIN-INLINE-NEXT: *** IR Dump After InlinerPass ++; FINEGRAIN-INLINE-NOT: Skip pass {{.*}}: True ++ ++; FINEGRAIN-2: AutoTuningCompile: Old decision (SkipPasses = True ) continued. ++; FINEGRAIN-2-NOT: Skip pass {{.*}}: False ++; FINEGRAIN-2: AutoTuningCompile: Deciding to enable/disable optimization of module/functions. Pass: loop-unroll ++; FINEGRAIN-UNROLL: AutoTuningCompile: SkipPasses disabled. 
++; FINEGRAIN-UNROLL-NOT: Skip pass {{.*}}: True +diff --git a/llvm/test/AutoTuning/Inline/Inputs/template.yaml b/llvm/test/AutoTuning/Inline/Inputs/template.yaml +new file mode 100644 +index 000000000000..e04612183d1f +--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/Inputs/template.yaml +@@ -0,0 +1,9 @@ ++--- !AutoTuning ++Pass: inline ++Name: simpleFunction-entry ++Function: bar ++CodeRegionType: callsite ++CodeRegionHash: 5550568187071847048 ++Args: ++ - ForceInline: [force-inline] ++... +diff --git a/llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml b/llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml +new file mode 100644 +index 000000000000..9fc88f56d6bc +--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/Inputs/template_no_metadata.yaml +@@ -0,0 +1,7 @@ ++--- !AutoTuning ++Pass: inline ++CodeRegionType: callsite ++CodeRegionHash: 5550568187071847048 ++Args: ++ - ForceInline: [force-inline] ++... +diff --git a/llvm/test/AutoTuning/Inline/duplicate-calls.ll b/llvm/test/AutoTuning/Inline/duplicate-calls.ll +new file mode 100644 +index 000000000000..ad32262ad044 +--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/duplicate-calls.ll +@@ -0,0 +1,96 @@ ++; RUN: rm %t.duplicate_calls -rf ++; RUN: opt %s -S -passes='cgscc(inline)' -auto-tuning-opp=%t.duplicate_calls \ ++; RUN: -auto-tuning-type-filter=CallSite --disable-output ++; RUN: FileCheck %s --input-file %t.duplicate_calls/duplicate-calls.ll.yaml ++ ++; ModuleID = 'duplicate-calls.c' ++source_filename = "duplicate-calls.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @bar(i32* nocapture %result, i32* %cfb, i32 %bytes) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %call = tail call i32 @test(i32* %cfb, i32 %bytes) #1, !dbg !12 ++ store i32 %call, i32* %result, align 4, !dbg !13, !tbaa !14 ++ ret void, !dbg !18 ++} ++ ++declare dso_local i32 @test(i32*, 
i32) local_unnamed_addr #0 ++ ++; Function Attrs: nounwind uwtable ++define dso_local void @foo(i32* %cfb, i32* readnone %saved, i32* nocapture %result, i32 %bytes) local_unnamed_addr #0 !dbg !19 { ++entry: ++ %tobool.not = icmp eq i32* %cfb, null, !dbg !20 ++ br i1 %tobool.not, label %if.else, label %if.then.split, !dbg !20 ++ ++if.then.split: ; preds = %entry ++ tail call void @bar(i32* %result, i32* nonnull %cfb, i32 %bytes), !dbg !21 ++ br label %return, !dbg !22 ++ ++if.else: ; preds = %entry ++ %tobool1.not = icmp eq i32* %saved, null, !dbg !23 ++ br i1 %tobool1.not, label %if.else.split, label %return, !dbg !23 ++ ++if.else.split: ; preds = %if.else ++ tail call void @bar(i32* %result, i32* null, i32 %bytes), !dbg !21 ++ br label %return, !dbg !23 ++ ++return: ; preds = %if.then.split, %if.else.split, %if.else ++ ret void, !dbg !24 ++} ++ ++attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { nounwind } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei BiSheng Compiler clang version 12.0.0 (clang-0d5d71fe6c22 flang-8b17fc131076)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "duplicate-calls.c", directory: "/home/m00629332/benchmarks/cBench/source/security_pgp_d/src") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, 
!"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei BiSheng Compiler clang version 12.0.0 (clang-0d5d71fe6c22 flang-8b17fc131076)"} ++!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 8, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 10, column: 16, scope: !10) ++!13 = !DILocation(line: 10, column: 14, scope: !10) ++!14 = !{!15, !15, i64 0} ++!15 = !{!"int", !16, i64 0} ++!16 = !{!"omnipotent char", !17, i64 0} ++!17 = !{!"Simple C/C++ TBAA"} ++!18 = !DILocation(line: 14, column: 1, scope: !10) ++!19 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 17, type: !11, scopeLine: 18, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!20 = !DILocation(line: 22, column: 6, scope: !19) ++!21 = !DILocation(line: 27, column: 2, scope: !19) ++!22 = !DILocation(line: 23, column: 3, scope: !19) ++!23 = !DILocation(line: 24, column: 11, scope: !19) ++!24 = !DILocation(line: 28, column: 1, scope: !19) ++ ++; CHECK: --- !AutoTuning ++; CHECK-NEXT: Pass: inline ++; CHECK-NEXT: Name: bar-if.then.split ++; CHECK-NEXT: DebugLoc: { File: duplicate-calls.c, Line: 27, Column: 2 } ++; CHECK-NEXT: Function: foo ++; CHECK-NEXT: CodeRegionType: callsite ++; CHECK-NEXT: CodeRegionHash: ++; CHECK-NEXT: DynamicConfigs: { ForceInline: [ 0, 1 ] } ++; CHECK-NEXT: BaselineConfig: { ForceInline: '1' } ++; CHECK-NEXT: Invocation: 0 ++; CHECK-NEXT: ... 
++; CHECK-NEXT: --- !AutoTuning ++; CHECK-NEXT: Pass: inline ++; CHECK-NEXT: Name: bar-if.else.split ++; CHECK-NEXT: DebugLoc: { File: duplicate-calls.c, Line: 27, Column: 2 } ++; CHECK-NEXT: Function: foo ++; CHECK-NEXT: CodeRegionType: callsite ++; CHECK-NEXT: CodeRegionHash: ++; CHECK-NEXT: DynamicConfigs: { ForceInline: [ 0, 1 ] } ++; CHECK-NEXT: BaselineConfig: { ForceInline: '1' } ++; CHECK-NEXT: Invocation: 0 +diff --git a/llvm/test/AutoTuning/Inline/force-inline.ll b/llvm/test/AutoTuning/Inline/force-inline.ll +new file mode 100644 +index 000000000000..cedfc8df3483 +--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/force-inline.ll +@@ -0,0 +1,84 @@ ++; REQUIRES: asserts ++; RUN: opt < %s -passes=inline -debug-only=inline -disable-output -S 2>&1 | FileCheck %s -check-prefix=DEFAULT ++; simpleFunction will be inlined with the default behavior. ++ ++; RUN: rm %t.force-inline.yaml -rf ++; RUN: sed 's#\[force-inline\]#true#g' %S/Inputs/template.yaml > %t.force-inline.yaml ++; RUN: opt %s -passes=inline -debug-only=inline -disable-output -S \ ++; RUN: -auto-tuning-input=%t.force-inline.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=FORCE-INLINE ++; Test with ForceInline=true; ++ ++; RUN: rm %t.force-inline.yaml -rf ++; RUN: sed 's#\[force-inline\]#true#g' %S/Inputs/template_no_metadata.yaml > %t.force-inline.yaml ++; RUN: opt %s -passes=inline -S -auto-tuning-input=%t.force-inline.yaml \ ++; RUN: -debug-only=inline -disable-output -auto-tuning-omit-metadata 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=FORCE-INLINE ++; Test with ForceInline=true; ++ ++; RUN: rm %t.no-inline.yaml -rf ++; RUN: sed 's#\[force-inline\]#false#g' %S/Inputs/template.yaml > %t.no-inline.yaml ++; RUN: opt %s -passes=inline -debug-only=inline -disable-output -S \ ++; RUN: -auto-tuning-input=%t.no-inline.yaml 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=NO-INLINE ++; Test with ForceInline=false; ++ ++; RUN: rm %t.no-inline.yaml -rf ++; RUN: sed 's#\[force-inline\]#false#g' 
%S/Inputs/template_no_metadata.yaml > %t.no-inline.yaml ++; RUN: opt %s -passes='cgscc(inline)' -debug-only=inline -disable-output -S \ ++; RUN: -auto-tuning-input=%t.no-inline.yaml -auto-tuning-omit-metadata 2>&1 | \ ++; RUN: FileCheck %s -check-prefix=NO-INLINE ++; Test with ForceInline=false; ++ ++@a = global i32 4 ++ ++; Function Attrs: nounwind readnone uwtable ++define i32 @simpleFunction(i32 %a) #0 { ++entry: ++ call void @extern() ++ %a1 = load volatile i32, i32* @a ++ %x1 = add i32 %a1, %a1 ++ %a2 = load volatile i32, i32* @a ++ %x2 = add i32 %x1, %a2 ++ %a3 = load volatile i32, i32* @a ++ %x3 = add i32 %x2, %a3 ++ %a4 = load volatile i32, i32* @a ++ %x4 = add i32 %x3, %a4 ++ %a5 = load volatile i32, i32* @a ++ %x5 = add i32 %x4, %a5 ++ %a6 = load volatile i32, i32* @a ++ %x6 = add i32 %x5, %a6 ++ %a7 = load volatile i32, i32* @a ++ %x7 = add i32 %x6, %a6 ++ %a8 = load volatile i32, i32* @a ++ %x8 = add i32 %x7, %a8 ++ %a9 = load volatile i32, i32* @a ++ %x9 = add i32 %x8, %a9 ++ %a10 = load volatile i32, i32* @a ++ %x10 = add i32 %x9, %a10 ++ %a11 = load volatile i32, i32* @a ++ %x11 = add i32 %x10, %a11 ++ %a12 = load volatile i32, i32* @a ++ %x12 = add i32 %x11, %a12 ++ %add = add i32 %x12, %a ++ ret i32 %add ++} ++ ++; Function Attrs: nounwind readnone uwtable ++define i32 @bar(i32 %a) #0 { ++entry: ++ %0 = tail call i32 @simpleFunction(i32 6) ++ ret i32 %0 ++} ++ ++declare void @extern() ++ ++attributes #0 = { nounwind readnone uwtable } ++attributes #1 = { nounwind cold readnone uwtable } ++ ++; DEFAULT: Inlining (cost=120, threshold=337) ++; DEFAULT-SAME: simpleFunction ++; FORCE-INLINE: Inlining (cost=always): Force inlined by auto-tuning ++; FORCE-INLINE-SAME: simpleFunction ++; NO-INLINE: NOT Inlining (cost=never): Force non-inlined by auto-tuning ++; NO-INLINE-SAME: simpleFunction +diff --git a/llvm/test/AutoTuning/Inline/inline-attribute.ll b/llvm/test/AutoTuning/Inline/inline-attribute.ll +new file mode 100644 +index 000000000000..50f583d0a51e 
+--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/inline-attribute.ll +@@ -0,0 +1,85 @@ ++; RUN: rm %t.inline_opp -rf ++; RUN: opt %s -S -passes='cgscc(inline)' -auto-tuning-opp=%t.inline_opp -auto-tuning-type-filter=CallSite --disable-output ++; RUN: FileCheck %s --input-file %t.inline_opp/inline-attribute.ll.yaml -check-prefix=TEST-1 ++; RUN: FileCheck %s --input-file %t.inline_opp/inline-attribute.ll.yaml -check-prefix=TEST-2 ++ ++; ModuleID = 'inline.c' ++source_filename = "inline.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: noinline norecurse nounwind readnone uwtable willreturn ++define dso_local i32 @mul(i32 %a) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %mul = mul nsw i32 %a, %a, !dbg !12 ++ ret i32 %mul, !dbg !13 ++} ++ ++; Function Attrs: alwaysinline nounwind uwtable ++define dso_local i32 @add(i32 %a) local_unnamed_addr #1 !dbg !14 { ++entry: ++ %add = shl nsw i32 %a, 1, !dbg !15 ++ ret i32 %add, !dbg !16 ++} ++ ++; Function Attrs: nounwind uwtable ++define dso_local i32 @inc(i32 %a) local_unnamed_addr #2 !dbg !17 { ++entry: ++ %inc = add nsw i32 %a, 1, !dbg !18 ++ ret i32 %inc, !dbg !19 ++} ++ ++; Function Attrs: nounwind uwtable ++define dso_local i32 @func(i32 %a) local_unnamed_addr #2 !dbg !20 { ++entry: ++ %call = call i32 @add(i32 %a), !dbg !21 ++ %call1 = call i32 @mul(i32 %a), !dbg !22 ++ %add = add nsw i32 %call, %call1, !dbg !23 ++ %call2 = call i32 @inc(i32 %a), !dbg !24 ++ %add3 = add nsw i32 %add, %call2, !dbg !25 ++ ret i32 %add3, !dbg !26 ++} ++ ++attributes #0 = { noinline norecurse nounwind readnone uwtable willreturn "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" 
"target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { alwaysinline nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #2 = { nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei Bisheng Compiler clang version 12.0.0 (729941c4adfa)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "test.c", directory: "/home/m00629332/code/autoTuner/ir-hashing") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei Bisheng Compiler clang version 12.0.0 (729941c4adfa)"} ++!10 = distinct !DISubprogram(name: "mul", scope: !1, file: !1, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = 
!DISubroutineType(types: !2) ++!12 = !DILocation(line: 3, column: 13, scope: !10) ++!13 = !DILocation(line: 3, column: 5, scope: !10) ++!14 = distinct !DISubprogram(name: "add", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!15 = !DILocation(line: 8, column: 13, scope: !14) ++!16 = !DILocation(line: 8, column: 5, scope: !14) ++!17 = distinct !DISubprogram(name: "inc", scope: !1, file: !1, line: 11, type: !11, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!18 = !DILocation(line: 12, column: 12, scope: !17) ++!19 = !DILocation(line: 12, column: 5, scope: !17) ++!20 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 15, type: !11, scopeLine: 15, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!21 = !DILocation(line: 16, column: 12, scope: !20) ++!22 = !DILocation(line: 16, column: 19, scope: !20) ++!23 = !DILocation(line: 16, column: 18, scope: !20) ++!24 = !DILocation(line: 16, column: 26, scope: !20) ++!25 = !DILocation(line: 16, column: 25, scope: !20) ++!26 = !DILocation(line: 16, column: 5, scope: !20) ++ ++; TEST-1: Pass: inline ++; TEST-1-NOT: Pass: inline ++ ++; TEST-2: Name: inc ++; TEST-2-NEXT: DebugLoc: { File: test.c, Line: 16, Column: 26 } ++; TEST-2-NEXT: Function: func ++; TEST-2-NEXT: CodeRegionType: callsite +diff --git a/llvm/test/AutoTuning/Inline/opp.ll b/llvm/test/AutoTuning/Inline/opp.ll +new file mode 100644 +index 000000000000..dfe1dac29476 +--- /dev/null ++++ b/llvm/test/AutoTuning/Inline/opp.ll +@@ -0,0 +1,64 @@ ++; RUN: rm %t.callsite_opp -rf ++; RUN: sed 's#\[number\]#25#g; s#\[func_name\]#ColdFunction#g' %S/Inputs/template.yaml > %t.template25.yaml ++; RUN: opt %s -passes=inline -S -auto-tuning-opp=%t.callsite_opp -auto-tuning-type-filter=CallSite ++ ++; RUN: FileCheck %s --input-file 
%t.callsite_opp/opp.ll.yaml -check-prefix=CALLSITE ++ ++@a = global i32 4 ++ ++declare void @extern() ++; Function Attrs: nounwind readnone uwtable ++define i32 @simpleFunction(i32 %a) #1 { ++entry: ++ call void @extern() ++ %a1 = load volatile i32, i32* @a ++ %x1 = add i32 %a1, %a1 ++ %a2 = load volatile i32, i32* @a ++ %x2 = add i32 %x1, %a2 ++ %a3 = load volatile i32, i32* @a ++ %x3 = add i32 %x2, %a3 ++ %a4 = load volatile i32, i32* @a ++ %x4 = add i32 %x3, %a4 ++ %a5 = load volatile i32, i32* @a ++ %x5 = add i32 %x4, %a5 ++ %a6 = load volatile i32, i32* @a ++ %x6 = add i32 %x5, %a6 ++ %a7 = load volatile i32, i32* @a ++ %x7 = add i32 %x6, %a6 ++ %a8 = load volatile i32, i32* @a ++ %x8 = add i32 %x7, %a8 ++ %a9 = load volatile i32, i32* @a ++ %x9 = add i32 %x8, %a9 ++ %a10 = load volatile i32, i32* @a ++ %x10 = add i32 %x9, %a10 ++ %a11 = load volatile i32, i32* @a ++ %x11 = add i32 %x10, %a11 ++ %a12 = load volatile i32, i32* @a ++ %x12 = add i32 %x11, %a12 ++ %add = add i32 %x12, %a ++ ret i32 %add ++} ++ ++define i32 @bar(i32 %a) #0 { ++entry: ++ %0 = tail call i32 @simpleFunction(i32 6) ++ ret i32 %0 ++} ++ ++attributes #0 = { nounwind readnone uwtable } ++attributes #1 = { nounwind cold readnone uwtable } ++ ++; Check if code regions are properly generated as tuning opportunities. ++; CALLSITE: --- !AutoTuning ++; CALLSITE-NEXT: Pass: inline ++; CALLSITE-NEXT: Name: simpleFunction ++; CALLSITE-NEXT: Function: bar ++; CALLSITE-NEXT: CodeRegionType: callsite ++; CALLSITE-NEXT: CodeRegionHash: {{[0-9]+}} ++; CALLSITE-NEXT: DynamicConfigs: { ForceInline: [ 0, 1 ] } ++; CALLSITE-NEXT: BaselineConfig: { ForceInline: '1' } ++; CALLSITE-NEXT: Invocation: 0 ++; CALLSITE-NEXT: ... ++ ++; Check if external functions are filtered out. 
++; EXTERNAL-NOT: Name: extern +diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml +new file mode 100644 +index 000000000000..6dc49a1f7dc2 +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/debug_loc_template.yaml +@@ -0,0 +1,10 @@ ++--- !AutoTuning ++Pass: loop-unroll ++Name: for.cond ++DebugLoc: { File: loop-opp.c, Line: 4, Column: 5 } ++Function: foo ++CodeRegionType: loop ++Args: ++ - UnrollCount: [number] ++Invocation: 0 ++... +diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml +new file mode 100644 +index 000000000000..4920329dbd4b +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_nest.yaml +@@ -0,0 +1,10 @@ ++# CodeRegionHash is correct for only first code region only. ++!AutoTuning {Args: [{UnrollCount: 2}], CodeRegionHash: 8456922293277663707, CodeRegionType: loop, ++ DebugLoc: {Column: 8, File: loop-nest.c, Line: 10}, Function: loop_nest, Invocation: 0, ++ Name: for.body6.us, Pass: loop-unroll} ++--- !AutoTuning {Args: [{UnrollCount: 4}], CodeRegionHash: 8456922293277663707, CodeRegionType: loop, ++ DebugLoc: {Column: 5, File: loop-nest.c, Line: 9}, Function: loop_nest, Invocation: 0, ++ Name: for.cond4.preheader.us, Pass: loop-unroll} ++--- !AutoTuning {Args: [{UnrollCount: 4}], CodeRegionHash: 8456922293277663707, CodeRegionType: loop, ++ DebugLoc: {Column: 3, File: loop-nest.c, Line: 8}, Function: loop_nest, Invocation: 0, ++ Name: for.cond1.preheader, Pass: loop-unroll} +diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml +new file mode 100644 +index 000000000000..a90cebbce88f +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/loop_peel.yaml +@@ -0,0 +1,9 @@ ++--- !AutoTuning ++Pass: loop-unroll ++Name: loop ++Function: invariant_backedge_1 ++CodeRegionType: loop ++Args: ++ - 
UnrollCount: [number] ++Invocation: 0 ++... +diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml +new file mode 100644 +index 000000000000..18681a0e2efe +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_raw_template.yaml +@@ -0,0 +1,10 @@ ++--- !AutoTuning ++Pass: loop-unroll ++Name: label %5 ++Function: main ++CodeRegionType: loop ++CodeRegionHash: [hash] ++Args: ++- UnrollCount: [number] ++Invocation: 1 ++... +diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml +new file mode 100644 +index 000000000000..166f877a232e +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template.yaml +@@ -0,0 +1,10 @@ ++--- !AutoTuning ++Pass: loop-unroll ++Name: [name] ++Function: foo ++CodeRegionType: loop ++CodeRegionHash: [hash] ++Args: ++ - UnrollCount: [number] ++Invocation: 1 ++... +diff --git a/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml +new file mode 100644 +index 000000000000..b626473cf782 +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/Inputs/unroll_template_no_metadata.yaml +@@ -0,0 +1,8 @@ ++--- !AutoTuning ++Pass: loop-unroll ++CodeRegionType: loop ++CodeRegionHash: [hash] ++Args: ++ - UnrollCount: [number] ++Invocation: 1 ++... 
+diff --git a/llvm/test/AutoTuning/LoopUnroll/debug_loc.ll b/llvm/test/AutoTuning/LoopUnroll/debug_loc.ll +new file mode 100644 +index 000000000000..85dd690d01c5 +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/debug_loc.ll +@@ -0,0 +1,161 @@ ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' | \ ++; RUN: FileCheck %s -check-prefix=DISABLE ++ ++; RUN: rm %t.unroll_debug_loc0.yaml -rf ++; RUN: sed 's#\[number\]#0#g' %S/Inputs/debug_loc_template.yaml > %t.unroll_debug_loc0.yaml ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%t.unroll_debug_loc0.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL0 ++ ++; RUN: rm %t.unroll_debug_loc4.yaml -rf ++; RUN: sed 's#\[number\]#4#g' %S/Inputs/debug_loc_template.yaml > %t.unroll_debug_loc4.yaml ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-input=%t.unroll_debug_loc4.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL4 ++ ++; RUN: rm %t.unroll4.yaml -rf ++; RUN: sed 's#\[number\]#4#g; s#\[name\]#for.cond#g; s#\[hash\]#11552168367013316892#g;'\ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll4.yaml ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-input=%t.unroll4.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL4-MISMATCH ++ ++; UNSUPPORTED: windows ++ ++; ModuleID = 'loop-opp.c' ++source_filename = "loop-opp.c" ++target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ++target triple = "x86_64-unknown-linux-gnu" ++ ++; Function Attrs: noinline nounwind uwtable ++define i32 @foo(i32* %n) #0 !dbg !6 { ++entry: ++ %n.addr = alloca i32*, align 8 ++ %b = alloca i32, align 4 ++ %i = alloca i32, align 4 ++ store i32* %n, i32** %n.addr, align 8 ++ call void @llvm.dbg.declare(metadata i32** %n.addr, metadata !11, metadata !12), !dbg !13 ++ call void @llvm.dbg.declare(metadata i32* %b, 
metadata !14, metadata !12), !dbg !15 ++ store i32 0, i32* %b, align 4, !dbg !15 ++ call void @llvm.dbg.declare(metadata i32* %i, metadata !16, metadata !12), !dbg !18 ++ store i32 0, i32* %i, align 4, !dbg !18 ++ br label %for.cond, !dbg !19 ++ ++for.cond: ; preds = %for.inc, %entry ++ %0 = load i32, i32* %i, align 4, !dbg !20 ++ %1 = load i32*, i32** %n.addr, align 8, !dbg !23 ++ %2 = load i32, i32* %1, align 4, !dbg !24 ++ %cmp = icmp slt i32 %0, %2, !dbg !25 ++ br i1 %cmp, label %for.body, label %for.end, !dbg !26 ++ ++for.body: ; preds = %for.cond ++ %3 = load i32, i32* %b, align 4, !dbg !28 ++ %add = add nsw i32 %3, 1, !dbg !30 ++ store i32 %add, i32* %b, align 4, !dbg !31 ++ br label %for.inc, !dbg !32 ++ ++for.inc: ; preds = %for.body ++ %4 = load i32, i32* %i, align 4, !dbg !33 ++ %inc = add nsw i32 %4, 1, !dbg !33 ++ store i32 %inc, i32* %i, align 4, !dbg !33 ++ br label %for.cond, !dbg !35, !llvm.loop !36 ++ ++for.end: ; preds = %for.cond ++ %5 = load i32, i32* %b, align 4, !dbg !39 ++ ret i32 %5, !dbg !40 ++} ++ ++; Function Attrs: nounwind readnone ++declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 ++ ++attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { nounwind readnone } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4} ++!llvm.ident = !{!5} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "" ,isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) ++!1 = !DIFile(filename: "loop-opp.c", directory: "") ++!2 = !{} 
++!3 = !{i32 2, !"Dwarf Version", i32 4} ++!4 = !{i32 2, !"Debug Info Version", i32 3} ++!5 = !{!""} ++!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0) ++!7 = !DISubroutineType(types: !8) ++!8 = !{!9, !10} ++!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ++!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64) ++!11 = !DILocalVariable(name: "n", arg: 1, scope: !6, file: !1, line: 1, type: !10) ++!12 = !DIExpression() ++!13 = !DILocation(line: 1, column: 20, scope: !6) ++!14 = !DILocalVariable(name: "b", scope: !6, file: !1, line: 3, type: !9) ++!15 = !DILocation(line: 3, column: 9, scope: !6) ++!16 = !DILocalVariable(name: "i", scope: !17, file: !1, line: 4, type: !9) ++!17 = distinct !DILexicalBlock(scope: !6, file: !1, line: 4, column: 5) ++!18 = !DILocation(line: 4, column: 14, scope: !17) ++!19 = !DILocation(line: 4, column: 10, scope: !17) ++!20 = !DILocation(line: 4, column: 20, scope: !21) ++!21 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 1) ++!22 = distinct !DILexicalBlock(scope: !17, file: !1, line: 4, column: 5) ++!23 = !DILocation(line: 4, column: 25, scope: !21) ++!24 = !DILocation(line: 4, column: 24, scope: !21) ++!25 = !DILocation(line: 4, column: 22, scope: !21) ++!26 = !DILocation(line: 4, column: 5, scope: !27) ++!27 = !DILexicalBlockFile(scope: !17, file: !1, discriminator: 1) ++!28 = !DILocation(line: 6, column: 11, scope: !29) ++!29 = distinct !DILexicalBlock(scope: !22, file: !1, line: 5, column: 5) ++!30 = !DILocation(line: 6, column: 12, scope: !29) ++!31 = !DILocation(line: 6, column: 9, scope: !29) ++!32 = !DILocation(line: 7, column: 5, scope: !29) ++!33 = !DILocation(line: 4, column: 28, scope: !34) ++!34 = !DILexicalBlockFile(scope: !22, file: !1, discriminator: 2) ++!35 = !DILocation(line: 4, column: 5, scope: !34) ++!36 = distinct !{!36, !37, !38} 
++!37 = !DILocation(line: 4, column: 5, scope: !17) ++!38 = !DILocation(line: 7, column: 5, scope: !17) ++!39 = !DILocation(line: 8, column: 12, scope: !6) ++!40 = !DILocation(line: 8, column: 5, scope: !6) ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled when the auto-tuning feature is disabled when ++; the input remark contains DebugLoc info. ++; ++; DISABLE-LABEL: @foo( ++; DISABLE: for.cond ++; DISABLE: for.body ++; DISABLE-NOT: for.body.1 ++; DISABLE: for.inc ++; DISABLE-NOT: llvm.loop.unroll.disable ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled ++; when unroll count explicitly set to be 0. ++; ++; UNROLL0-LABEL: @foo( ++; UNROLL0: for.cond ++; UNROLL0: for.body ++; UNROLL0-NOT: for.body.1 ++; UNROLL0: for.inc ++; UNROLL0-NOT: llvm.loop.unroll.disable ++ ++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 4 ++; when explicitly requested. ++; ++; UNROLL4-LABEL: @foo( ++; UNROLL4: for.cond ++; UNROLL4: for.body ++; UNROLL4: for.body.1 ++; UNROLL4: for.body.2 ++; UNROLL4: for.body.3 ++; UNROLL4: llvm.loop.unroll.disable ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled ++; when DebugLoc is missing in the input remark. 
++; ++; UNROLL4-MISMATCH-LABEL: @foo( ++; UNROLL4-MISMATCH: for.cond ++; UNROLL4-MISMATCH: for.body ++; UNROLL4-MISMATCH-NOT: for.body.1 ++; UNROLL4-MISMATCH: for.inc ++; UNROLL4-MISMATCH-NOT: llvm.loop.unroll.disable +diff --git a/llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll b/llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll +new file mode 100644 +index 000000000000..414c6ff2d1b0 +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/dynamic_config.ll +@@ -0,0 +1,56 @@ ++; RUN: rm %t.default_opp -rf ++; RUN: opt %s -S -auto-tuning-opp=%t.default_opp -auto-tuning-type-filter=Loop \ ++; RUN: -passes='require,loop(loop-unroll-full)' --disable-output ++; RUN: FileCheck %s --input-file %t.default_opp/dynamic_config.ll.yaml ++ ++; Function Attrs: nofree norecurse nounwind uwtable ++define dso_local void @transform(i64* nocapture %W) local_unnamed_addr{ ++entry: ++ br label %for.body ++ ++for.body: ; preds = %entry, %for.body ++ %i.037 = phi i32 [ 16, %entry ], [ %inc, %for.body ] ++ %sub = add nsw i32 %i.037, -3 ++ %idxprom = sext i32 %sub to i64 ++ %arrayidx = getelementptr inbounds i64, i64* %W, i64 %idxprom ++ %0 = load i64, i64* %arrayidx, align 8 ++ %sub1 = add nsw i32 %i.037, -6 ++ %idxprom2 = sext i32 %sub1 to i64 ++ %arrayidx3 = getelementptr inbounds i64, i64* %W, i64 %idxprom2 ++ %1 = load i64, i64* %arrayidx3, align 8 ++ %xor = xor i64 %1, %0 ++ %idxprom4 = zext i32 %i.037 to i64 ++ %arrayidx5 = getelementptr inbounds i64, i64* %W, i64 %idxprom4 ++ store i64 %xor, i64* %arrayidx5, align 8 ++ %inc = add nuw nsw i32 %i.037, 1 ++ %cmp = icmp ult i32 %i.037, 79 ++ br i1 %cmp, label %for.body, label %for.body8.preheader ++ ++for.body8.preheader: ; preds = %for.body ++ br label %for.body8 ++ ++for.body8: ; preds = %for.body8.preheader, %for.body8 ++ %indvars.iv = phi i64 [ 80, %for.body8.preheader ], [ %indvars.iv.next, %for.body8 ] ++ %2 = add nsw i64 %indvars.iv, -4 ++ %arrayidx11 = getelementptr inbounds i64, i64* %W, i64 %2 ++ %3 = load i64, i64* 
%arrayidx11, align 8 ++ %4 = add nsw i64 %indvars.iv, -5 ++ %arrayidx14 = getelementptr inbounds i64, i64* %W, i64 %4 ++ %5 = load i64, i64* %arrayidx14, align 8 ++ %xor15 = xor i64 %5, %3 ++ %arrayidx17 = getelementptr inbounds i64, i64* %W, i64 %indvars.iv ++ store i64 %xor15, i64* %arrayidx17, align 8 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp ne i64 %indvars.iv.next, 256 ++ br i1 %exitcond, label %for.body8, label %for.end20 ++ ++for.end20: ; preds = %for.body8 ++ ret void ++} ++ ++; CHECK: --- !AutoTuning ++; CHECK: DynamicConfigs: { UnrollCount: [ 0, 1, 64, 16, 32 ] ++; CHECK: ... ++; CHECK-NEXT: --- !AutoTuning ++; CHECK: DynamicConfigs: { UnrollCount: [ 0, 1, 64, 16, 32 ] ++; CHECK: ... +diff --git a/llvm/test/AutoTuning/LoopUnroll/loop_nest.ll b/llvm/test/AutoTuning/LoopUnroll/loop_nest.ll +new file mode 100644 +index 000000000000..7f3e27ca057a +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/loop_nest.ll @@ -0,0 +1,136 @@ ++; REQUIRES: asserts ++; CodeRegionHash matches for the first code region only. AutoTuner will find ++; match for one code region when hash matching is enabled. AutoTuner will find ++; match for all three code regions when hash matching is disabled.
++ ++; RUN: rm -rf %t.loop_nest.txt ++; RUN: opt %s -passes='require,loop(loop-unroll-full)' \ ++; RUN: -debug-only=autotuning -auto-tuning-input=%S/Inputs/loop_nest.yaml \ ++; RUN: --disable-output &> %t.loop_nest.txt ++; RUN: grep 'UnrollCount is set' %t.loop_nest.txt | wc -l | \ ++; RUN: FileCheck %s -check-prefix=HASH_MATCHING_ENABLED ++ ++; RUN: rm -rf %t.loop_nest.txt ++; RUN: opt %s -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-input=%S/Inputs/loop_nest.yaml -debug-only=autotuning \ ++; RUN: -auto-tuning-code-region-matching-hash=false --disable-output &> %t.loop_nest.txt ++; RUN: grep 'UnrollCount is set' %t.loop_nest.txt | wc -l | \ ++; RUN: FileCheck %s -check-prefix=HASH_MATCHING_DISABLED ++ ++; ModuleID = 'loop-nest.c' ++source_filename = "loop-nest.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: nofree norecurse nounwind uwtable ++define dso_local void @loop_nest(i32 %ni, i32 %nj, i32 %nk, i32 %alpha, i32 %beta, i32** nocapture readonly %A, i32** nocapture readonly %B, i32** nocapture readonly %C) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %cmp41 = icmp sgt i32 %ni, 0, !dbg !12 ++ br i1 %cmp41, label %for.cond1.preheader.lr.ph, label %for.end23, !dbg !13 ++ ++for.cond1.preheader.lr.ph: ; preds = %entry ++ %cmp238 = icmp slt i32 %nk, 1 ++ %cmp536 = icmp slt i32 %nj, 1 ++ %wide.trip.count51 = zext i32 %ni to i64, !dbg !12 ++ %wide.trip.count47 = zext i32 %nk to i64 ++ %wide.trip.count = zext i32 %nj to i64 ++ %brmerge = or i1 %cmp238, %cmp536 ++ br label %for.cond1.preheader, !dbg !13 ++ ++for.cond1.preheader: ; preds = %for.cond1.preheader.lr.ph, %for.inc21 ++ %indvars.iv49 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next50, %for.inc21 ] ++ br i1 %brmerge, label %for.inc21, label %for.cond4.preheader.us.preheader, !dbg !14 ++ ++for.cond4.preheader.us.preheader: ; preds = %for.cond1.preheader ++ %arrayidx15 = 
getelementptr inbounds i32*, i32** %C, i64 %indvars.iv49 ++ %arrayidx = getelementptr inbounds i32*, i32** %A, i64 %indvars.iv49 ++ %.pre = load i32*, i32** %arrayidx, align 8, !tbaa !15 ++ %.pre53 = load i32*, i32** %arrayidx15, align 8, !tbaa !15 ++ br label %for.cond4.preheader.us, !dbg !14 ++ ++for.cond4.preheader.us: ; preds = %for.cond4.preheader.us.preheader, %for.cond4.for.inc18_crit_edge.us ++ %indvars.iv45 = phi i64 [ 0, %for.cond4.preheader.us.preheader ], [ %indvars.iv.next46, %for.cond4.for.inc18_crit_edge.us ] ++ %arrayidx8.us = getelementptr inbounds i32, i32* %.pre, i64 %indvars.iv45 ++ %arrayidx10.us = getelementptr inbounds i32*, i32** %B, i64 %indvars.iv45 ++ %0 = load i32*, i32** %arrayidx10.us, align 8, !tbaa !15 ++ br label %for.body6.us, !dbg !19 ++ ++for.body6.us: ; preds = %for.cond4.preheader.us, %for.body6.us ++ %indvars.iv = phi i64 [ 0, %for.cond4.preheader.us ], [ %indvars.iv.next, %for.body6.us ] ++ %1 = load i32, i32* %arrayidx8.us, align 4, !dbg !20, !tbaa !21 ++ %mul.us = mul nsw i32 %1, %alpha, !dbg !23 ++ %arrayidx12.us = getelementptr inbounds i32, i32* %0, i64 %indvars.iv, !dbg !24 ++ %2 = load i32, i32* %arrayidx12.us, align 4, !dbg !24, !tbaa !21 ++ %mul13.us = mul nsw i32 %mul.us, %2, !dbg !25 ++ %arrayidx17.us = getelementptr inbounds i32, i32* %.pre53, i64 %indvars.iv, !dbg !26 ++ %3 = load i32, i32* %arrayidx17.us, align 4, !dbg !27, !tbaa !21 ++ %add.us = add nsw i32 %3, %mul13.us, !dbg !27 ++ store i32 %add.us, i32* %arrayidx17.us, align 4, !dbg !27, !tbaa !21 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !28 ++ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !29 ++ br i1 %exitcond.not, label %for.cond4.for.inc18_crit_edge.us, label %for.body6.us, !dbg !19, !llvm.loop !30 ++ ++for.cond4.for.inc18_crit_edge.us: ; preds = %for.body6.us ++ %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1, !dbg !33 ++ %exitcond48.not = icmp eq i64 %indvars.iv.next46, %wide.trip.count47, !dbg !34 ++ br 
i1 %exitcond48.not, label %for.inc21, label %for.cond4.preheader.us, !dbg !14, !llvm.loop !35 ++ ++for.inc21: ; preds = %for.cond4.for.inc18_crit_edge.us, %for.cond1.preheader ++ %indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, 1, !dbg !37 ++ %exitcond52.not = icmp eq i64 %indvars.iv.next50, %wide.trip.count51, !dbg !12 ++ br i1 %exitcond52.not, label %for.end23, label %for.cond1.preheader, !dbg !13, !llvm.loop !38 ++ ++for.end23: ; preds = %for.inc21, %entry ++ ret void, !dbg !40 ++} ++ ++attributes #0 = { nofree norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei BiSheng Compiler clang version 12.0.0 (clang-a279e099a09a flang-9a86b70390a7)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "loop-nest.c", directory: "/home/m00629332/code/autoTuner") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, !"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei BiSheng Compiler clang version 12.0.0 (clang-a279e099a09a flang-9a86b70390a7)"} ++!10 = distinct !DISubprogram(name: "loop_nest", scope: !1, file: !1, line: 1, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, 
retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 8, column: 17, scope: !10) ++!13 = !DILocation(line: 8, column: 3, scope: !10) ++!14 = !DILocation(line: 9, column: 5, scope: !10) ++!15 = !{!16, !16, i64 0} ++!16 = !{!"any pointer", !17, i64 0} ++!17 = !{!"omnipotent char", !18, i64 0} ++!18 = !{!"Simple C/C++ TBAA"} ++!19 = !DILocation(line: 10, column: 8, scope: !10) ++!20 = !DILocation(line: 11, column: 23, scope: !10) ++!21 = !{!22, !22, i64 0} ++!22 = !{!"int", !17, i64 0} ++!23 = !DILocation(line: 11, column: 21, scope: !10) ++!24 = !DILocation(line: 11, column: 33, scope: !10) ++!25 = !DILocation(line: 11, column: 31, scope: !10) ++!26 = !DILocation(line: 11, column: 4, scope: !10) ++!27 = !DILocation(line: 11, column: 12, scope: !10) ++!28 = !DILocation(line: 10, column: 29, scope: !10) ++!29 = !DILocation(line: 10, column: 22, scope: !10) ++!30 = distinct !{!30, !19, !31, !32} ++!31 = !DILocation(line: 11, column: 39, scope: !10) ++!32 = !{!"llvm.loop.mustprogress"} ++!33 = !DILocation(line: 9, column: 26, scope: !10) ++!34 = !DILocation(line: 9, column: 19, scope: !10) ++!35 = distinct !{!35, !14, !36, !32} ++!36 = !DILocation(line: 12, column: 5, scope: !10) ++!37 = !DILocation(line: 8, column: 24, scope: !10) ++!38 = distinct !{!38, !13, !39, !32} ++!39 = !DILocation(line: 13, column: 3, scope: !10) ++!40 = !DILocation(line: 15, column: 1, scope: !10) ++ ++; HASH_MATCHING_ENABLED: 1 ++; HASH_MATCHING_DISABLED: 3 +diff --git a/llvm/test/AutoTuning/LoopUnroll/loop_peel.ll b/llvm/test/AutoTuning/LoopUnroll/loop_peel.ll +new file mode 100644 +index 000000000000..f3839a49b20e +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/loop_peel.ll +@@ -0,0 +1,53 @@ ++; NOTE: This file is used to test when UnrollCount = 1 and when the compiler ++; sees that Loop Peeling is beneficial and possible, then we do Loop Peeling. 
++; RUN: rm %t.unroll1.yaml -rf ++; RUN: sed 's#\[number\]#1#g;' %S/Inputs/loop_peel.yaml > %t.unroll1.yaml ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-input=%t.unroll1.yaml | FileCheck %s ++ ++; RUN: rm %t.unroll0.yaml -rf ++; RUN: sed 's#\[number\]#0#g;' %S/Inputs/loop_peel.yaml > %t.unroll0.yaml ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-input=%t.unroll0.yaml | FileCheck %s --check-prefix=DISABLE ++ ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' \ ++; RUN: -auto-tuning-code-region-matching-hash=false \ ++; RUN: -auto-tuning-opp=%t.unroll_opp -auto-tuning-type-filter=Loop --disable-output ++; RUN: FileCheck %s --input-file %t.unroll_opp/loop_peel.ll.yaml -check-prefix=TEST-1 ++ ++define i32 @invariant_backedge_1(i32 %a, i32 %b) { ++; CHECK-LABEL: @invariant_backedge_1 ++; CHECK-NOT: %plus = phi ++; CHECK: loop.peel: ++; CHECK: loop: ++; CHECK: %i = phi ++; CHECK: %sum = phi ++; DISABLE-LABEL: @invariant_backedge_1 ++; DISABLE-NOT: loop.peel: ++entry: ++ br label %loop ++ ++loop: ++ %i = phi i32 [ 0, %entry ], [ %inc, %loop ] ++ %sum = phi i32 [ 0, %entry ], [ %incsum, %loop ] ++ %plus = phi i32 [ %a, %entry ], [ %b, %loop ] ++ ++ %incsum = add i32 %sum, %plus ++ %inc = add i32 %i, 1 ++ %cmp = icmp slt i32 %i, 1000 ++ ++ br i1 %cmp, label %loop, label %exit ++ ++exit: ++ ret i32 %sum ++} ++ ++; Check for dynamic values when UnrollCount is set to 1: ++; TEST-1: Pass: loop-unroll ++; TEST-1-NEXT: Name: loop ++; TEST-1-NEXT: Function: invariant_backedge_1 ++; TEST-1-NEXT: CodeRegionType: loop ++; TEST-1-NEXT: CodeRegionHash: {{[0-9]+}} ++; TEST-1-NEXT: DynamicConfigs: { UnrollCount: [ 0, 1, 2 ] } +diff --git a/llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll b/llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll +new file mode 100644 +index 000000000000..843b8e28f3d8 
+--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/unroll-pragma.ll @@ -0,0 +1,129 @@ ++; RUN: rm %t.unroll_opp -rf ++; RUN: opt %s -S -auto-tuning-opp=%t.unroll_opp -auto-tuning-type-filter=Loop \ ++; RUN: -passes='require,loop(loop-unroll-full)' --disable-output ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-1 ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-2 ++ ++; RUN: rm %t.unroll_opp -rf ++; RUN: opt %s -S -auto-tuning-opp=%t.unroll_opp -auto-tuning-type-filter=Loop \ ++; RUN: -passes='require,function(loop-unroll)' --disable-output ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-1 ++; RUN: FileCheck %s --input-file %t.unroll_opp/unroll-pragma.ll.yaml -check-prefix=TEST-2 ++ ++; This function contains two loops. loop for.body is defined with a pragma ++; unroll_count(4) and loop for.body9 is without a pragma. AutoTuner will only ++; consider for.body9 as a tuning opportunity.
++ ++; ModuleID = 'loop-unroll.c' ++source_filename = "loop-unroll.c" ++target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ++target triple = "aarch64-unknown-linux-gnu" ++ ++; Function Attrs: nofree norecurse nounwind uwtable ++define dso_local void @loop(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32* noalias nocapture %d, i32 %len) local_unnamed_addr #0 !dbg !10 { ++entry: ++ %cmp34 = icmp slt i32 0, %len, !dbg !12 ++ br i1 %cmp34, label %for.body.lr.ph, label %for.cond6.preheader, !dbg !13 ++ ++for.body.lr.ph: ; preds = %entry ++ br label %for.body, !dbg !13 ++ ++for.cond.for.cond6.preheader_crit_edge: ; preds = %for.body ++ br label %for.cond6.preheader, !dbg !13 ++ ++for.cond6.preheader: ; preds = %for.cond.for.cond6.preheader_crit_edge, %entry ++ %cmp732 = icmp slt i32 0, %len, !dbg !14 ++ br i1 %cmp732, label %for.body9.lr.ph, label %for.cond.cleanup8, !dbg !15 ++ ++for.body9.lr.ph: ; preds = %for.cond6.preheader ++ br label %for.body9, !dbg !15 ++ ++for.body: ; preds = %for.body.lr.ph, %for.body ++ %i.035 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] ++ %idxprom = zext i32 %i.035 to i64, !dbg !16 ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom, !dbg !16 ++ %0 = load i32, i32* %arrayidx, align 4, !dbg !16, !tbaa !17 ++ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %idxprom, !dbg !21 ++ %1 = load i32, i32* %arrayidx2, align 4, !dbg !21, !tbaa !17 ++ %add = add nsw i32 %1, %0, !dbg !22 ++ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %idxprom, !dbg !23 ++ store i32 %add, i32* %arrayidx4, align 4, !dbg !24, !tbaa !17 ++ %inc = add nuw nsw i32 %i.035, 1, !dbg !25 ++ %cmp = icmp slt i32 %inc, %len, !dbg !12 ++ br i1 %cmp, label %for.body, label %for.cond.for.cond6.preheader_crit_edge, !dbg !13, !llvm.loop !26 ++ ++for.cond6.for.cond.cleanup8_crit_edge: ; preds = %for.body9 ++ br label %for.cond.cleanup8, !dbg !15 ++ ++for.cond.cleanup8: ; preds 
= %for.cond6.for.cond.cleanup8_crit_edge, %for.cond6.preheader ++ ret void, !dbg !30 ++ ++for.body9: ; preds = %for.body9.lr.ph, %for.body9 ++ %i5.033 = phi i32 [ 0, %for.body9.lr.ph ], [ %inc17, %for.body9 ] ++ %idxprom10 = zext i32 %i5.033 to i64, !dbg !31 ++ %arrayidx11 = getelementptr inbounds i32, i32* %a, i64 %idxprom10, !dbg !31 ++ %2 = load i32, i32* %arrayidx11, align 4, !dbg !31, !tbaa !17 ++ %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 %idxprom10, !dbg !32 ++ %3 = load i32, i32* %arrayidx13, align 4, !dbg !32, !tbaa !17 ++ %mul = mul nsw i32 %3, %2, !dbg !33 ++ %arrayidx15 = getelementptr inbounds i32, i32* %d, i64 %idxprom10, !dbg !34 ++ store i32 %mul, i32* %arrayidx15, align 4, !dbg !35, !tbaa !17 ++ %inc17 = add nuw nsw i32 %i5.033, 1, !dbg !36 ++ %cmp7 = icmp slt i32 %inc17, %len, !dbg !14 ++ br i1 %cmp7, label %for.body9, label %for.cond6.for.cond.cleanup8_crit_edge, !dbg !15, !llvm.loop !37 ++} ++ ++attributes #0 = { nofree norecurse nounwind uwtable "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } ++ ++!llvm.dbg.cu = !{!0} ++!llvm.module.flags = !{!3, !4, !5, !6, !7, !8} ++!llvm.ident = !{!9} ++ ++!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Huawei Bisheng Compiler clang version 12.0.0 (0261bbf0b2fd)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) ++!1 = !DIFile(filename: "loop-unroll.c", directory: "/home/AutoTuner/") ++!2 = !{} ++!3 = !{i32 2, !"Debug Info Version", i32 3} ++!4 = !{i32 1, !"wchar_size", i32 4} ++!5 = !{i32 1, !"branch-target-enforcement", i32 0} ++!6 = !{i32 1, 
!"sign-return-address", i32 0} ++!7 = !{i32 1, !"sign-return-address-all", i32 0} ++!8 = !{i32 1, !"sign-return-address-with-bkey", i32 0} ++!9 = !{!"Huawei Bisheng Compiler clang version 12.0.0 (0261bbf0b2fd)"} ++!10 = distinct !DISubprogram(name: "a", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ++!11 = !DISubroutineType(types: !2) ++!12 = !DILocation(line: 3, column: 20, scope: !10) ++!13 = !DILocation(line: 3, column: 5, scope: !10) ++!14 = !DILocation(line: 7, column: 20, scope: !10) ++!15 = !DILocation(line: 7, column: 5, scope: !10) ++!16 = !DILocation(line: 4, column: 16, scope: !10) ++!17 = !{!18, !18, i64 0} ++!18 = !{!"int", !19, i64 0} ++!19 = !{!"omnipotent char", !20, i64 0} ++!20 = !{!"Simple C/C++ TBAA"} ++!21 = !DILocation(line: 4, column: 23, scope: !10) ++!22 = !DILocation(line: 4, column: 21, scope: !10) ++!23 = !DILocation(line: 4, column: 9, scope: !10) ++!24 = !DILocation(line: 4, column: 14, scope: !10) ++!25 = !DILocation(line: 3, column: 28, scope: !10) ++!26 = distinct !{!26, !13, !27, !28, !29} ++!27 = !DILocation(line: 5, column: 5, scope: !10) ++!28 = !{!"llvm.loop.mustprogress"} ++!29 = !{!"llvm.loop.unroll.count", i32 4} ++!30 = !DILocation(line: 10, column: 1, scope: !10) ++!31 = !DILocation(line: 8, column: 16, scope: !10) ++!32 = !DILocation(line: 8, column: 23, scope: !10) ++!33 = !DILocation(line: 8, column: 21, scope: !10) ++!34 = !DILocation(line: 8, column: 9, scope: !10) ++!35 = !DILocation(line: 8, column: 14, scope: !10) ++!36 = !DILocation(line: 7, column: 28, scope: !10) ++!37 = distinct !{!37, !15, !38, !28} ++!38 = !DILocation(line: 9, column: 5, scope: !10) ++ ++ ++; TEST-1: Pass: loop-unroll ++; TEST-1-NOT: Pass: loop-unroll ++ ++; TEST-2: Name: for.body9 ++; TEST-2-NEXT: DebugLoc: { File: loop-unroll.c, Line: 7, Column: 5 } ++; TEST-2-NEXT: Function: loop ++; TEST-2-NEXT: CodeRegionType: loop +diff --git 
a/llvm/test/AutoTuning/LoopUnroll/unroll.ll b/llvm/test/AutoTuning/LoopUnroll/unroll.ll +new file mode 100644 +index 000000000000..ba5c89fffaff +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/unroll.ll +@@ -0,0 +1,101 @@ ++; RUN: opt %s -S -passes=loop-unroll | FileCheck %s -check-prefix=DISABLE ++ ++; RUN: rm %t.unroll0.yaml -rf ++; RUN: sed 's#\[number\]#0#g; s#\[name\]#for.body#g; s#\[hash\]#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll0.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll0.yaml \ ++; RUN: -auto-tuning-code-region-matching-hash=false | \ ++; RUN: FileCheck %s -check-prefix=UNROLL0 ++ ++; RUN: rm %t.unroll0.yaml -rf ++; RUN: sed 's#\[number\]#0#g; s#\[hash\]#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template_no_metadata.yaml > %t.unroll0.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll0.yaml \ ++; RUN: -auto-tuning-omit-metadata | \ ++; RUN: FileCheck %s -check-prefix=UNROLL0 ++ ++; RUN: rm %t.result1 %t.unroll1.yaml -rf ++; RUN: sed 's#\[number\]#1#g; s#\[name\]#for.body#g; s#\[hash\]#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll1.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll1.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL1 ++ ++; RUN: rm %t.result1 %t.unroll1.yaml -rf ++; RUN: sed 's#\[number\]#1#g; s#\[hash\]#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template_no_metadata.yaml > %t.unroll1.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll1.yaml \ ++; RUN: -auto-tuning-omit-metadata | \ ++; RUN: FileCheck %s -check-prefix=UNROLL1 ++ ++; RUN: rm %t.result4 %t.unroll4.yaml -rf ++; RUN: sed 's#\[number\]#4#g; s#\[name\]#for.body#g; s#\[hash\]#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template.yaml > %t.unroll4.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll4.yaml | \ ++; RUN: FileCheck %s -check-prefix=UNROLL4 ++ ++; RUN: rm 
%t.result4 %t.unroll4.yaml -rf ++; RUN: sed 's#\[number\]#4#g; s#\[hash\]#14791762861362113823#g' \ ++; RUN: %S/Inputs/unroll_template_no_metadata.yaml > %t.unroll4.yaml ++; RUN: opt %s -S -passes=loop-unroll -auto-tuning-input=%t.unroll4.yaml \ ++; RUN: -auto-tuning-omit-metadata | \ ++; RUN: FileCheck %s -check-prefix=UNROLL4 ++ ++; UNSUPPORTED: windows ++ ++define void @foo(i32* nocapture %a) { ++entry: ++ br label %for.body ++ ++for.body: ; preds = %for.body, %entry ++ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] ++ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv ++ %0 = load i32, i32* %arrayidx, align 4 ++ %inc = add nsw i32 %0, 1 ++ store i32 %inc, i32* %arrayidx, align 4 ++ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ++ %exitcond = icmp eq i64 %indvars.iv.next, 64 ++ br i1 %exitcond, label %for.end, label %for.body ++ ++for.end: ; preds = %for.body ++ ret void ++} ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled when the auto-tuning feature is disabled ++; ++; DISABLE-LABEL: @foo( ++; DISABLE: store i32 ++; DISABLE-NOT: store i32 ++; DISABLE: br i1 ++; DISABLE-NOT: llvm.loop.unroll.disable ++ ++ ++; Auto-tuning-enabled loop unrolling - check that the loop is not unrolled ++; when unroll count explicitly set to be 0. ++; ++; UNROLL0-LABEL: @foo( ++; UNROLL0: store i32 ++; UNROLL0-NOT: store i32 ++; UNROLL0: br i1 ++; UNROLL0-NOT: llvm.loop.unroll.disable ++ ++ ++; Auto-tuning-enabled loop unrolling - Requesting UnrollCount = 1 will perform ++; Loop Peeling, and if Loop Peeling isn't possible/beneficial then Unroll Count ++; is unchanged. ++; ++; UNROLL1-LABEL: @foo( ++; UNROLL1: store i32 ++; UNROLL1-NOT: store i32 ++; UNROLL1: br i1 ++; UNROLL1: llvm.loop.unroll.disable ++ ++; Auto-tuning-enabled loop unrolling - check that we can unroll the loop by 4 ++; when explicitly requested. 
++; ++; UNROLL4-LABEL: @foo( ++; UNROLL4: store i32 ++; UNROLL4: store i32 ++; UNROLL4: store i32 ++; UNROLL4: store i32 ++; UNROLL4: br i1 ++; UNROLL4: llvm.loop.unroll.disable +diff --git a/llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll b/llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll +new file mode 100644 +index 000000000000..480ccad640ae +--- /dev/null ++++ b/llvm/test/AutoTuning/LoopUnroll/unroll_raw.ll +@@ -0,0 +1,113 @@ ++; Test loop unrolling using auto-tuning YAML api with IRs generated when ASSERTION=OFF ++; The IRs generated when ASSERTION=OFF usually only use slot numbers as variable names. ++ ++; RUN: opt %s -S -passes='require,loop(loop-unroll-full)' | \ ++; RUN: FileCheck %s -check-prefix=DISABLE ++ ++; RUN: rm %t.result1_raw %t.unroll1_raw.yaml -rf ++; RUN: sed 's#\[number\]#1#g; s#\[hash\]#18159364858606519094#g' \ ++; RUN: %S/Inputs/unroll_raw_template.yaml > %t.unroll1_raw.yaml ++; RUN: opt %s -S -passes='require,function(loop-unroll)' \ ++; RUN: -auto-tuning-input=%t.unroll1_raw.yaml | FileCheck %s -check-prefix=UNROLL1 ++ ++; RUN: rm %t.result2_raw %t.unroll2_raw.yaml -rf ++; RUN: sed 's#\[number\]#2#g; s#\[hash\]#18159364858606519094#g' \ ++; RUN: %S/Inputs/unroll_raw_template.yaml > %t.unroll2_raw.yaml ++; RUN: opt %s -S -passes='require,function(loop-unroll)' \ ++; RUN: -auto-tuning-input=%t.unroll2_raw.yaml | FileCheck %s -check-prefix=UNROLL2 ++ ++; RUN: rm %t.result4_raw %t.unroll4_raw.yaml -rf ++; RUN: sed 's#\[number\]#4#g; s#\[hash\]#18159364858606519094#g' \ ++; RUN: %S/Inputs/unroll_raw_template.yaml > %t.unroll4_raw.yaml ++; RUN: opt %s -S -passes='require,function(loop-unroll)' \ ++; RUN: -auto-tuning-input=%t.unroll4_raw.yaml | FileCheck %s -check-prefix=UNROLL4 ++ ++; UNSUPPORTED: windows ++ ++; ModuleID = 't.ll' ++source_filename = "t.ll" ++ ++@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 ++ ++define void @test(i32*) { ++ %2 = alloca i32*, align 8 ++ store i32* %0, i32** %2, align 8 ++ %3 = load i32*, i32** 
%2, align 8 ++ %4 = load i32, i32* %3, align 4 ++ %5 = add nsw i32 %4, 2 ++ %6 = load i32*, i32** %2, align 8 ++ store i32 %5, i32* %6, align 4 ++ ret void ++} ++ ++define i32 @main() { ++ %1 = alloca i32, align 4 ++ %2 = alloca i32, align 4 ++ store i32 0, i32* %1, align 4 ++ store i32 8, i32* %2, align 4 ++ %3 = load i32, i32* %2, align 4 ++ %4 = icmp sle i32 %3, 88 ++ br i1 %4, label %.lr.ph, label %13 ++ ++.lr.ph: ; preds = %0 ++ br label %5 ++ ++;