chore: update pre-commit hooks (#2745)
* chore: update pre-commit hooks
* trigger linter when pre-commit hooks are updated
* fix type checks from upgraded pre-commit

---------

Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com>
Co-authored-by: Wing Lian <wing@axolotl.ai>
commit 94219f6ee8 (parent ecc719f5c7)
.github/workflows/lint.yml
@@ -9,6 +9,7 @@ on:
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
+      - ".pre-commit-config.yaml"
   workflow_dispatch:

 jobs:
.pre-commit-config.yaml

@@ -27,7 +27,7 @@ repos:
     hooks:
       - id: pylint
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.16.0
     hooks:
       - id: mypy
         additional_dependencies:
@@ -280,19 +280,19 @@ class LoRA_MLP(torch.autograd.Function):
         # Initialize and compute LoRA gradients
         d_down_A = d_down_B = d_up_A = d_up_B = d_gate_A = d_gate_B = None

-        if down_A is not None:
+        if down_A is not None and down_B is not None:
             d_down_A = h.t() @ (grad_output @ down_B.t())
             d_down_B = (down_A.t() @ h.t()) @ grad_output
             d_down_A *= down_scale
             d_down_B *= down_scale

-        if up_A is not None:
+        if up_A is not None and up_B is not None:
             d_up_A = X.t() @ (grad_up @ up_B.t())
             d_up_B = (up_A.t() @ X.t()) @ grad_up
             d_up_A *= up_scale
             d_up_B *= up_scale

-        if gate_A is not None:
+        if gate_A is not None and gate_B is not None:
             d_gate_A = X.t() @ (grad_gate @ gate_B.t())
             d_gate_B = (gate_A.t() @ X.t()) @ grad_gate
             d_gate_A *= gate_scale
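For orientation, the guarded blocks above compute the gradients of the LoRA factors. Below is a minimal sketch of the same math, assuming a branch of the form `out = scale * (h @ A @ B)` with made-up shapes; it illustrates the formulas in the diff, not the kernel's actual code path:

```python
import torch

# Hypothetical shapes: tokens = 4, hidden = 8, rank = 2, out = 8.
h = torch.randn(4, 8)
A = torch.randn(8, 2, requires_grad=True)   # LoRA "A" factor
B = torch.randn(2, 8, requires_grad=True)   # LoRA "B" factor
scale = 0.5
grad_output = torch.randn(4, 8)             # upstream gradient w.r.t. the branch output

# Manual gradients, mirroring d_down_A / d_down_B in the diff above:
#   dL/dA = h^T (grad_output B^T) * scale
#   dL/dB = (A^T h^T) grad_output * scale
d_A = h.t() @ (grad_output @ B.t()) * scale
d_B = (A.t() @ h.t()) @ grad_output * scale

# Cross-check against autograd on the LoRA branch alone.
out = scale * (h @ A @ B)
out.backward(grad_output)
assert torch.allclose(d_A, A.grad, atol=1e-5)
assert torch.allclose(d_B, B.grad, atol=1e-5)
```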
@@ -311,7 +311,7 @@ class LoRA_MLP(torch.autograd.Function):
             del up_weight

         # Note the .to(dtype) only where mixing LoRA with base weights
-        if up_A is not None:
+        if up_A is not None and up_B is not None:
             dX += grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t())

         # Gate projection gradients
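The hunk above accumulates the input gradient through the LoRA branch and casts the factors to the activation dtype only where they mix with the base-weight path. A rough sketch of that accumulation, assuming bfloat16 activations and float32 LoRA factors (hypothetical shapes and names):

```python
import torch

dtype = torch.bfloat16
grad_up = torch.randn(4, 8, dtype=dtype)    # upstream gradient of the up projection
W = torch.randn(8, 8, dtype=dtype)          # frozen base weight, reduced precision
up_A = torch.randn(8, 2)                    # LoRA factors kept in float32
up_B = torch.randn(2, 8)
up_scale = 0.5

# Base-weight contribution to the input gradient.
dX = grad_up @ W.t()

# LoRA contribution: cast the float32 factors to the activation dtype only at
# the point where they mix with reduced-precision tensors, mirroring
# `grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t())` in the diff.
dX += grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t())
print(dX.shape, dX.dtype)  # torch.Size([4, 8]) torch.bfloat16
```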
@@ -319,7 +319,7 @@ class LoRA_MLP(torch.autograd.Function):
         dX += grad_gate @ gate_weight.t()
         del gate_weight

-        if gate_A is not None:
+        if gate_A is not None and gate_B is not None:
             dX += (
                 grad_gate
                 @ gate_B.to(dtype).t()
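For context on the repeated `and ..._B is not None` guards: when the LoRA factors are typed `Optional[torch.Tensor]`, narrowing only the `A` factor leaves the `B` factor as possibly `None`, which the upgraded mypy rejects when `.t()` is called on it. A minimal sketch of the pattern under that assumption (hypothetical function and parameter names, not the project's actual signatures):

```python
from typing import Optional

import torch


def lora_grads(
    h: torch.Tensor,
    grad_output: torch.Tensor,
    down_A: Optional[torch.Tensor],
    down_B: Optional[torch.Tensor],
    down_scale: float,
) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
    d_down_A = d_down_B = None

    # Checking only `down_A` would not narrow `down_B`, so mypy reports
    # something like: Item "None" of "Optional[Tensor]" has no attribute "t".
    # Narrowing both factors, as in the diff, keeps the body type-safe.
    if down_A is not None and down_B is not None:
        d_down_A = h.t() @ (grad_output @ down_B.t()) * down_scale
        d_down_B = (down_A.t() @ h.t()) @ grad_output * down_scale
    return d_down_A, d_down_B
```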