From 7c128bbdac0da1767c239174e91af6f327845372 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Thu, 19 Oct 2023 13:56:17 +0800
Subject: Add fp8 for sd unet

---
 modules/cmd_args.py  | 1 +
 modules/sd_models.py | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'modules')

diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 4e602a84..0f14c71e 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -118,3 +118,4 @@ parser.add_argument('--timeout-keep-alive', type=int, default=30, help='set time
 parser.add_argument("--disable-all-extensions", action='store_true', help="prevent all extensions from running regardless of any other settings", default=False)
 parser.add_argument("--disable-extra-extensions", action='store_true', help="prevent all extensions except built-in from running regardless of any other settings", default=False)
 parser.add_argument("--skip-load-model-at-start", action='store_true', help="if load a model at web start, only take effect when --nowebui", )
+parser.add_argument("--opt-unet-fp8-storage", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 3b6cdea1..3b8ff820 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -391,6 +391,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
 
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
+        if shared.cmd_opts.opt_unet_fp8_storage:
+            model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
+            timer.record("apply fp8 unet")
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
-- 
cgit v1.2.1


From 5f9ddfa46f28ca2aa9e0bd832f6bbd67069be63e Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Thu, 19 Oct 2023 23:57:22 +0800
Subject: Add sdxl only arg

---
 modules/cmd_args.py  | 1 +
 modules/sd_models.py | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'modules')

diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 0f14c71e..20bfb2c4 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -119,3 +119,4 @@ parser.add_argument("--disable-all-extensions", action='store_true', help="preve
 parser.add_argument("--disable-extra-extensions", action='store_true', help="prevent all extensions except built-in from running regardless of any other settings", default=False)
 parser.add_argument("--skip-load-model-at-start", action='store_true', help="if load a model at web start, only take effect when --nowebui", )
 parser.add_argument("--opt-unet-fp8-storage", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
+parser.add_argument("--opt-unet-fp8-storage-xl", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 3b8ff820..08af128f 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -394,6 +394,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         if shared.cmd_opts.opt_unet_fp8_storage:
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
+        elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
+            model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
+            timer.record("apply fp8 unet for sdxl")
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
-- 
cgit v1.2.1


From eaa9f5162fbca2ebcb2682eb861bc7e5510a2b66 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 24 Oct 2023 01:49:05 +0800
Subject: Add CPU fp8 support

Since norm layers need fp32, only the linear-operation layers (Conv2d/Linear) are converted to fp8.

The text encoder (TE) also uses some PyTorch functions that do not support bf16 AMP on CPU, so I added a condition to indicate whether the autocast is for the UNet.
---
 modules/devices.py    |  6 +++++-
 modules/processing.py |  2 +-
 modules/sd_models.py  | 20 ++++++++++++++++----
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'modules')

diff --git a/modules/devices.py b/modules/devices.py
index 1d4eb563..0cd2b55d 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -71,6 +71,7 @@ def enable_tf32():
 errors.run(enable_tf32, "Enabling TF32")
 
 cpu: torch.device = torch.device("cpu")
+fp8: bool = False
 device: torch.device = None
 device_interrogate: torch.device = None
 device_gfpgan: torch.device = None
@@ -93,10 +94,13 @@ def cond_cast_float(input):
 nv_rng = None
 
 
-def autocast(disable=False):
+def autocast(disable=False, unet=False):
     if disable:
         return contextlib.nullcontext()
 
+    if unet and fp8 and device==cpu:
+        return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()
 
diff --git a/modules/processing.py b/modules/processing.py
index 40598f5c..2df8a7ea 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -865,7 +865,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"
 
-            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(unet=True):
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
 
             if getattr(samples_ddim, 'already_decoded', False):
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 08af128f..c5fe57bf 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -391,12 +391,24 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
 
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
-        if shared.cmd_opts.opt_unet_fp8_storage:
+
+    if shared.cmd_opts.opt_unet_fp8_storage:
+        enable_fp8 = True
+    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
+        enable_fp8 = True
+    
+    if enable_fp8:
+        devices.fp8 = True
+        if devices.device == devices.cpu:
+            for module in model.model.diffusion_model.modules():
+                if isinstance(module, torch.nn.Conv2d):
+                    module.to(torch.float8_e4m3fn)
+                elif isinstance(module, torch.nn.Linear):
+                    module.to(torch.float8_e4m3fn)
+            timer.record("apply fp8 unet for cpu")
+        else:
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
-        elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
-            model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet for sdxl")
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
-- 
cgit v1.2.1


From 9c1eba2af3a6f9cd6282b3a367656793cbe70c01 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 24 Oct 2023 02:11:27 +0800
Subject: Fix lint

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index c5fe57bf..44d4038b 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -396,7 +396,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         enable_fp8 = True
     elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
         enable_fp8 = True
-    
+
     if enable_fp8:
         devices.fp8 = True
         if devices.device == devices.cpu:
-- 
cgit v1.2.1


From 1df6c8bfec4715610d64684b6ad2fa38c76c1df6 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 11:36:43 +0800
Subject: fp8 for TE

---
 modules/sd_models.py | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 44d4038b..69395294 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -407,6 +407,13 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
                     module.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet for cpu")
         else:
+            if model.is_sdxl:
+                cond_stage = model.conditioner
+            else:
+                cond_stage = model.cond_stage_model
+            for module in cond_stage.modules():
+                if isinstance(module, torch.nn.Linear):
+                    module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
 
-- 
cgit v1.2.1


From 4830b251366436ee8499c003fe87e46ddb4a4581 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 11:53:37 +0800
Subject: Fix alphas_cumprod dtype

---
 modules/sd_models.py | 1 +
 1 file changed, 1 insertion(+)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 69395294..23660454 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -416,6 +416,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
                     module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
+        model.alphas_cumprod = model.alphas_cumprod.to(torch.float32)
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
-- 
cgit v1.2.1


From bf5067f50ca32cd4764638702e3cc38bca8bfd8b Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 12:54:28 +0800
Subject: Fix alphas cumprod

---
 modules/sd_models.py    | 3 ++-
 modules/sd_models_xl.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 23660454..7ed89a9c 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -396,6 +396,8 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         enable_fp8 = True
     elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
         enable_fp8 = True
+    else:
+        enable_fp8 = False
 
     if enable_fp8:
         devices.fp8 = True
@@ -416,7 +418,6 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
                     module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
             timer.record("apply fp8 unet")
-        model.alphas_cumprod = model.alphas_cumprod.to(torch.float32)
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
diff --git a/modules/sd_models_xl.py b/modules/sd_models_xl.py
index 01123321..11259a36 100644
--- a/modules/sd_models_xl.py
+++ b/modules/sd_models_xl.py
@@ -93,7 +93,7 @@ def extend_sdxl(model):
     model.parameterization = "v" if isinstance(model.denoiser.scaling, sgm.modules.diffusionmodules.denoiser_scaling.VScaling) else "eps"
 
     discretization = sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization()
-    model.alphas_cumprod = torch.asarray(discretization.alphas_cumprod, device=devices.device, dtype=dtype)
+    model.alphas_cumprod = torch.asarray(discretization.alphas_cumprod, device=devices.device, dtype=torch.float32)
 
     model.conditioner.wrapped = torch.nn.Module()
 
-- 
cgit v1.2.1


From dda067f64d3289cee3ffd65767126cb30ae73b13 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 19:53:22 +0800
Subject: ignore mps for fp8

---
 modules/sd_models.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 7ed89a9c..ccb6afd2 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -392,7 +392,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
 
-    if shared.cmd_opts.opt_unet_fp8_storage:
+    if devices.get_optimal_device_name() == "mps":
+        enable_fp8 = False
+    elif shared.cmd_opts.opt_unet_fp8_storage:
         enable_fp8 = True
     elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
         enable_fp8 = True
-- 
cgit v1.2.1


From 0beb131c7ffae6f756a6339206da311232a36970 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 25 Oct 2023 20:07:37 +0800
Subject: change torch version

---
 modules/launch_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'modules')

diff --git a/modules/launch_utils.py b/modules/launch_utils.py
index 8cdbafa5..636da679 100644
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@@ -308,8 +308,8 @@ def requirements_met(requirements_file):
 
 
 def prepare_environment():
-    torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu118")
-    torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.0.1 torchvision==0.15.2 --extra-index-url {torch_index_url}")
+    torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu121")
+    torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.1.0 torchvision==0.16.0 --extra-index-url {torch_index_url}")
     requirements_file = os.environ.get('REQS_FILE', "requirements_versions.txt")
 
     xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.20')
-- 
cgit v1.2.1


From d4d3134f6d2d232c7bcfa80900a362921e644976 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 28 Oct 2023 15:24:26 +0800
Subject: ManualCast for 10/16 series gpu

---
 modules/devices.py    | 57 +++++++++++++++++++++++++++++++++++++++++++++------
 modules/processing.py |  2 +-
 modules/sd_models.py  | 21 +++++++++++--------
 3 files changed, 64 insertions(+), 16 deletions(-)

(limited to 'modules')

diff --git a/modules/devices.py b/modules/devices.py
index 0cd2b55d..c05f2b35 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -16,6 +16,23 @@ def has_mps() -> bool:
         return mac_specific.has_mps
 
 
+def cuda_no_autocast(device_id=None) -> bool:
+    if device_id is None:
+        device_id = get_cuda_device_id()
+    return (
+        torch.cuda.get_device_capability(device_id) == (7, 5) 
+        and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
+    )
+
+
+def get_cuda_device_id():
+    return (
+        int(shared.cmd_opts.device_id) 
+        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() 
+        else 0
+    ) or torch.cuda.current_device()
+
+
 def get_cuda_device_string():
     if shared.cmd_opts.device_id is not None:
         return f"cuda:{shared.cmd_opts.device_id}"
@@ -60,8 +77,7 @@ def enable_tf32():
 
         # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
         # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
-        device_id = (int(shared.cmd_opts.device_id) if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() else 0) or torch.cuda.current_device()
-        if torch.cuda.get_device_capability(device_id) == (7, 5) and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16"):
+        if cuda_no_autocast():
             torch.backends.cudnn.benchmark = True
 
         torch.backends.cuda.matmul.allow_tf32 = True
@@ -92,15 +108,44 @@ def cond_cast_float(input):
 
 
 nv_rng = None
-
-
-def autocast(disable=False, unet=False):
+patch_module_list = [
+    torch.nn.Linear,
+    torch.nn.Conv2d,
+    torch.nn.MultiheadAttention,
+    torch.nn.GroupNorm,
+    torch.nn.LayerNorm,
+]
+
+@contextlib.contextmanager
+def manual_autocast():
+    def manual_cast_forward(self, *args, **kwargs):
+        org_dtype = next(self.parameters()).dtype
+        self.to(dtype)
+        result = self.org_forward(*args, **kwargs)
+        self.to(org_dtype)
+        return result
+    for module_type in patch_module_list:
+        org_forward = module_type.forward
+        module_type.forward = manual_cast_forward
+        module_type.org_forward = org_forward
+    try:
+        yield None
+    finally:
+        for module_type in patch_module_list:
+            module_type.forward = module_type.org_forward
+
+
+def autocast(disable=False):
+    print(fp8, dtype, shared.cmd_opts.precision, device)
     if disable:
         return contextlib.nullcontext()
 
-    if unet and fp8 and device==cpu:
+    if fp8 and device==cpu:
         return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
 
+    if fp8 and (dtype == torch.float32 or shared.cmd_opts.precision == "full" or cuda_no_autocast()):
+        return manual_autocast()
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()
 
diff --git a/modules/processing.py b/modules/processing.py
index 2df8a7ea..40598f5c 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -865,7 +865,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"
 
-            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast(unet=True):
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
 
             if getattr(samples_ddim, 'already_decoded', False):
diff --git a/modules/sd_models.py b/modules/sd_models.py
index ccb6afd2..31bcb913 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -403,23 +403,26 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
 
     if enable_fp8:
         devices.fp8 = True
+        if model.is_sdxl:
+            cond_stage = model.conditioner
+        else:
+            cond_stage = model.cond_stage_model
+
+        for module in cond_stage.modules():
+            if isinstance(module, torch.nn.Linear):
+                module.to(torch.float8_e4m3fn)
+
         if devices.device == devices.cpu:
             for module in model.model.diffusion_model.modules():
                 if isinstance(module, torch.nn.Conv2d):
                     module.to(torch.float8_e4m3fn)
                 elif isinstance(module, torch.nn.Linear):
                     module.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet for cpu")
         else:
-            if model.is_sdxl:
-                cond_stage = model.conditioner
-            else:
-                cond_stage = model.cond_stage_model
-            for module in cond_stage.modules():
-                if isinstance(module, torch.nn.Linear):
-                    module.to(torch.float8_e4m3fn)
             model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
-            timer.record("apply fp8 unet")
+        timer.record("apply fp8")
+    else:
+        devices.fp8 = False
 
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
-- 
cgit v1.2.1


From ddc2a3499b8cd120b4a42358bcd33137ce1d1e75 Mon Sep 17 00:00:00 2001
From: KohakuBlueleaf <apolloyeh0123@gmail.com>
Date: Sat, 28 Oct 2023 16:52:35 +0800
Subject: Add MPS manual cast

---
 modules/devices.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/devices.py b/modules/devices.py
index c05f2b35..d7c905c2 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -121,6 +121,8 @@ def manual_autocast():
     def manual_cast_forward(self, *args, **kwargs):
         org_dtype = next(self.parameters()).dtype
         self.to(dtype)
+        args = [arg.to(dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
+        kwargs = {k: v.to(dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
         result = self.org_forward(*args, **kwargs)
         self.to(org_dtype)
         return result
@@ -136,7 +138,6 @@ def manual_autocast():
 
 
 def autocast(disable=False):
-    print(fp8, dtype, shared.cmd_opts.precision, device)
     if disable:
         return contextlib.nullcontext()
 
@@ -146,6 +147,9 @@ def autocast(disable=False):
     if fp8 and (dtype == torch.float32 or shared.cmd_opts.precision == "full" or cuda_no_autocast()):
         return manual_autocast()
 
+    if has_mps() and shared.cmd_opts.precision != "full":
+        return manual_autocast()
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()
 
-- 
cgit v1.2.1


From 598da5cd4928618b166886d3485ce30ce3a43490 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sun, 19 Nov 2023 15:50:06 +0800
Subject: Use options instead of cmd_args

---
 modules/cmd_args.py        |  2 --
 modules/devices.py         | 25 ++++++++++---------
 modules/initialize_util.py |  1 +
 modules/sd_models.py       | 61 ++++++++++++++++++++++++----------------------
 modules/shared_options.py  |  1 +
 5 files changed, 48 insertions(+), 42 deletions(-)

(limited to 'modules')

diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 088d5dea..a9fb9bfa 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -118,5 +118,3 @@ parser.add_argument('--timeout-keep-alive', type=int, default=30, help='set time
 parser.add_argument("--disable-all-extensions", action='store_true', help="prevent all extensions from running regardless of any other settings", default=False)
 parser.add_argument("--disable-extra-extensions", action='store_true', help="prevent all extensions except built-in from running regardless of any other settings", default=False)
 parser.add_argument("--skip-load-model-at-start", action='store_true', help="if load a model at web start, only take effect when --nowebui", )
-parser.add_argument("--opt-unet-fp8-storage", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
-parser.add_argument("--opt-unet-fp8-storage-xl", action='store_true', help="use fp8 for SD UNet to save vram", default=False)
diff --git a/modules/devices.py b/modules/devices.py
index d7c905c2..03e7bdb7 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -20,15 +20,15 @@ def cuda_no_autocast(device_id=None) -> bool:
     if device_id is None:
         device_id = get_cuda_device_id()
     return (
-        torch.cuda.get_device_capability(device_id) == (7, 5) 
+        torch.cuda.get_device_capability(device_id) == (7, 5)
         and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
     )
 
 
 def get_cuda_device_id():
     return (
-        int(shared.cmd_opts.device_id) 
-        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() 
+        int(shared.cmd_opts.device_id)
+        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()
         else 0
     ) or torch.cuda.current_device()
 
@@ -116,16 +116,19 @@ patch_module_list = [
     torch.nn.LayerNorm,
 ]
 
+
+def manual_cast_forward(self, *args, **kwargs):
+    org_dtype = next(self.parameters()).dtype
+    self.to(dtype)
+    args = [arg.to(dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
+    kwargs = {k: v.to(dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
+    result = self.org_forward(*args, **kwargs)
+    self.to(org_dtype)
+    return result
+
+
 @contextlib.contextmanager
 def manual_autocast():
-    def manual_cast_forward(self, *args, **kwargs):
-        org_dtype = next(self.parameters()).dtype
-        self.to(dtype)
-        args = [arg.to(dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
-        kwargs = {k: v.to(dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
-        result = self.org_forward(*args, **kwargs)
-        self.to(org_dtype)
-        return result
     for module_type in patch_module_list:
         org_forward = module_type.forward
         module_type.forward = manual_cast_forward
diff --git a/modules/initialize_util.py b/modules/initialize_util.py
index 2e9b6d89..1b11ead6 100644
--- a/modules/initialize_util.py
+++ b/modules/initialize_util.py
@@ -177,6 +177,7 @@ def configure_opts_onchange():
     shared.opts.onchange("temp_dir", ui_tempdir.on_tmpdir_changed)
     shared.opts.onchange("gradio_theme", shared.reload_gradio_theme)
     shared.opts.onchange("cross_attention_optimization", wrap_queued_call(lambda: sd_hijack.model_hijack.redo_hijack(shared.sd_model)), call=False)
+    shared.opts.onchange("fp8_storage", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
     startup_timer.record("opts onchange")
 
 
diff --git a/modules/sd_models.py b/modules/sd_models.py
index a6c8b2fa..eb491434 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -339,10 +339,28 @@ class SkipWritingToConfig:
         SkipWritingToConfig.skip = self.previous
 
 
+def check_fp8(model):
+    if model is None:
+        return None
+    if devices.get_optimal_device_name() == "mps":
+        enable_fp8 = False
+    elif shared.opts.fp8_storage == "Enable":
+        enable_fp8 = True
+    elif getattr(model, "is_sdxl", False) and shared.opts.fp8_storage == "Enable for SDXL":
+        enable_fp8 = True
+    else:
+        enable_fp8 = False
+    return enable_fp8
+
+
 def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer):
     sd_model_hash = checkpoint_info.calculate_shorthash()
     timer.record("calculate hash")
 
+    if not check_fp8(model) and devices.fp8:
+        # prevent model to load state dict in fp8
+        model.half()
+
     if not SkipWritingToConfig.skip:
         shared.opts.data["sd_model_checkpoint"] = checkpoint_info.title
 
@@ -395,34 +413,16 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
 
-    if devices.get_optimal_device_name() == "mps":
-        enable_fp8 = False
-    elif shared.cmd_opts.opt_unet_fp8_storage:
-        enable_fp8 = True
-    elif model.is_sdxl and shared.cmd_opts.opt_unet_fp8_storage_xl:
-        enable_fp8 = True
-    else:
-        enable_fp8 = False
-
-    if enable_fp8:
+    if check_fp8(model):
         devices.fp8 = True
-        if model.is_sdxl:
-            cond_stage = model.conditioner
-        else:
-            cond_stage = model.cond_stage_model
-
-        for module in cond_stage.modules():
-            if isinstance(module, torch.nn.Linear):
+        first_stage = model.first_stage_model
+        model.first_stage_model = None
+        for module in model.modules():
+            if isinstance(module, torch.nn.Conv2d):
                 module.to(torch.float8_e4m3fn)
-
-        if devices.device == devices.cpu:
-            for module in model.model.diffusion_model.modules():
-                if isinstance(module, torch.nn.Conv2d):
-                    module.to(torch.float8_e4m3fn)
-                elif isinstance(module, torch.nn.Linear):
-                    module.to(torch.float8_e4m3fn)
-        else:
-            model.model.diffusion_model = model.model.diffusion_model.to(torch.float8_e4m3fn)
+            elif isinstance(module, torch.nn.Linear):
+                module.to(torch.float8_e4m3fn)
+        model.first_stage_model = first_stage
         timer.record("apply fp8")
     else:
         devices.fp8 = False
@@ -769,7 +769,7 @@ def reuse_model_from_already_loaded(sd_model, checkpoint_info, timer):
         return None
 
 
-def reload_model_weights(sd_model=None, info=None):
+def reload_model_weights(sd_model=None, info=None, forced_reload=False):
     checkpoint_info = info or select_checkpoint()
 
     timer = Timer()
@@ -781,11 +781,14 @@ def reload_model_weights(sd_model=None, info=None):
         current_checkpoint_info = None
     else:
         current_checkpoint_info = sd_model.sd_checkpoint_info
-        if sd_model.sd_model_checkpoint == checkpoint_info.filename:
+        if check_fp8(sd_model) != devices.fp8:
+            # load from state dict again to prevent extra numerical errors
+            forced_reload = True
+        elif sd_model.sd_model_checkpoint == checkpoint_info.filename:
             return sd_model
 
     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
-    if sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
+    if not forced_reload and sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
         return sd_model
 
     if sd_model is not None:
diff --git a/modules/shared_options.py b/modules/shared_options.py
index f1003f21..d27f35e9 100644
--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@@ -200,6 +200,7 @@ options_templates.update(options_section(('optimizations', "Optimizations"), {
     "pad_cond_uncond": OptionInfo(False, "Pad prompt/negative prompt to be same length", infotext='Pad conds').info("improves performance when prompt and negative prompt have different lengths; changes seeds"),
     "persistent_cond_cache": OptionInfo(True, "Persistent cond cache").info("do not recalculate conds from prompts if prompts have not changed since previous calculation"),
     "batch_cond_uncond": OptionInfo(True, "Batch cond/uncond").info("do both conditional and unconditional denoising in one batch; uses a bit more VRAM during sampling, but improves speed; previously this was controlled by --always-batch-cond-uncond comandline argument"),
+    "fp8_storage": OptionInfo("Disable", "FP8 weight", gr.Dropdown, {"choices": ["Disable", "Enable for SDXL", "Enable"]}).info("Use FP8 to store Linear/Conv layers' weight. Require pytorch>=2.1.0."),
 }))
 
 options_templates.update(options_section(('compatibility', "Compatibility"), {
-- 
cgit v1.2.1


From 890181e1d456b613bf60f6e8378dc68b39011af9 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sun, 19 Nov 2023 15:54:39 +0800
Subject: Update the xformers/torch versions

---
 modules/errors.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'modules')

diff --git a/modules/errors.py b/modules/errors.py
index 8c339464..a3498c11 100644
--- a/modules/errors.py
+++ b/modules/errors.py
@@ -93,8 +93,8 @@ def check_versions():
     import torch
     import gradio
 
-    expected_torch_version = "2.0.0"
-    expected_xformers_version = "0.0.20"
+    expected_torch_version = "2.1.0"
+    expected_xformers_version = "0.0.22.post7"
     expected_gradio_version = "3.41.2"
 
     if version.parse(torch.__version__) < version.parse(expected_torch_version):
-- 
cgit v1.2.1


From f383af2729ec2d1969200218577ab19dd78f7d48 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sun, 19 Nov 2023 15:56:23 +0800
Subject: update xformers/torch versions

---
 modules/launch_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/launch_utils.py b/modules/launch_utils.py
index 636da679..c225bbc1 100644
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@@ -312,7 +312,7 @@ def prepare_environment():
     torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.1.0 torchvision==0.16.0 --extra-index-url {torch_index_url}")
     requirements_file = os.environ.get('REQS_FILE', "requirements_versions.txt")
 
-    xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.20')
+    xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.22.post7')
     clip_package = os.environ.get('CLIP_PACKAGE', "https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip")
     openclip_package = os.environ.get('OPENCLIP_PACKAGE', "https://github.com/mlfoundations/open_clip/archive/bb6e834e9c70d9c27d0dc3ecedeebeaeb1ffad6b.zip")
 
-- 
cgit v1.2.1


From 043d2edcf6a543f236f1f3cb70ac72e7b3b357b6 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sun, 19 Nov 2023 15:56:31 +0800
Subject: Better naming

---
 modules/devices.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'modules')

diff --git a/modules/devices.py b/modules/devices.py
index 03e7bdb7..c19a7f40 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -128,7 +128,7 @@ def manual_cast_forward(self, *args, **kwargs):
 
 
 @contextlib.contextmanager
-def manual_autocast():
+def manual_cast():
     for module_type in patch_module_list:
         org_forward = module_type.forward
         module_type.forward = manual_cast_forward
@@ -148,10 +148,10 @@ def autocast(disable=False):
         return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
 
     if fp8 and (dtype == torch.float32 or shared.cmd_opts.precision == "full" or cuda_no_autocast()):
-        return manual_autocast()
+        return manual_cast()
 
     if has_mps() and shared.cmd_opts.precision != "full":
-        return manual_autocast()
+        return manual_cast()
 
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()
-- 
cgit v1.2.1


From 370a77f8e78e65a8a1339289d684cb43df142f70 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Tue, 21 Nov 2023 19:59:34 +0800
Subject: Option for using fp16 weight when applying LoRA

---
 modules/initialize_util.py |  1 +
 modules/sd_models.py       | 14 +++++++++++---
 modules/shared_options.py  |  1 +
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'modules')

diff --git a/modules/initialize_util.py b/modules/initialize_util.py
index 1b11ead6..7fb1d8d5 100644
--- a/modules/initialize_util.py
+++ b/modules/initialize_util.py
@@ -178,6 +178,7 @@ def configure_opts_onchange():
     shared.opts.onchange("gradio_theme", shared.reload_gradio_theme)
     shared.opts.onchange("cross_attention_optimization", wrap_queued_call(lambda: sd_hijack.model_hijack.redo_hijack(shared.sd_model)), call=False)
     shared.opts.onchange("fp8_storage", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
+    shared.opts.onchange("cache_fp16_weight", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
     startup_timer.record("opts onchange")
 
 
diff --git a/modules/sd_models.py b/modules/sd_models.py
index eb491434..0a7777f1 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -413,14 +413,22 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
 
+    for module in model.modules():
+        if hasattr(module, 'fp16_weight'):
+            del module.fp16_weight
+        if hasattr(module, 'fp16_bias'):
+            del module.fp16_bias
+
     if check_fp8(model):
         devices.fp8 = True
         first_stage = model.first_stage_model
         model.first_stage_model = None
         for module in model.modules():
-            if isinstance(module, torch.nn.Conv2d):
-                module.to(torch.float8_e4m3fn)
-            elif isinstance(module, torch.nn.Linear):
+            if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
+                if shared.opts.cache_fp16_weight:
+                    module.fp16_weight = module.weight.clone().half()
+                    if module.bias is not None:
+                        module.fp16_bias = module.bias.clone().half()
                 module.to(torch.float8_e4m3fn)
         model.first_stage_model = first_stage
         timer.record("apply fp8")
diff --git a/modules/shared_options.py b/modules/shared_options.py
index d27f35e9..eaa9f135 100644
--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@@ -201,6 +201,7 @@ options_templates.update(options_section(('optimizations', "Optimizations"), {
     "persistent_cond_cache": OptionInfo(True, "Persistent cond cache").info("do not recalculate conds from prompts if prompts have not changed since previous calculation"),
     "batch_cond_uncond": OptionInfo(True, "Batch cond/uncond").info("do both conditional and unconditional denoising in one batch; uses a bit more VRAM during sampling, but improves speed; previously this was controlled by --always-batch-cond-uncond comandline argument"),
     "fp8_storage": OptionInfo("Disable", "FP8 weight", gr.Dropdown, {"choices": ["Disable", "Enable for SDXL", "Enable"]}).info("Use FP8 to store Linear/Conv layers' weight. Require pytorch>=2.1.0."),
+    "cache_fp16_weight": OptionInfo(False, "Cache FP16 weight for LoRA").info("Cache fp16 weight when enabling FP8, will increase the quality of LoRA. Use more system ram."),
 }))
 
 options_templates.update(options_section(('compatibility', "Compatibility"), {
-- 
cgit v1.2.1


From f5d719d1f1baa775d838aa75d9af1971bcc78e8f Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 22 Nov 2023 01:45:56 +0800
Subject: Add forced reload for fp16 cache

---
 modules/initialize_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/initialize_util.py b/modules/initialize_util.py
index 7fb1d8d5..b6767138 100644
--- a/modules/initialize_util.py
+++ b/modules/initialize_util.py
@@ -178,7 +178,7 @@ def configure_opts_onchange():
     shared.opts.onchange("gradio_theme", shared.reload_gradio_theme)
     shared.opts.onchange("cross_attention_optimization", wrap_queued_call(lambda: sd_hijack.model_hijack.redo_hijack(shared.sd_model)), call=False)
     shared.opts.onchange("fp8_storage", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
-    shared.opts.onchange("cache_fp16_weight", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
+    shared.opts.onchange("cache_fp16_weight", wrap_queued_call(lambda: sd_models.reload_model_weights(forced_reload=True)), call=False)
     startup_timer.record("opts onchange")
 
 
-- 
cgit v1.2.1


From 40ac134c553ac824d4a96666bba14d550300daa5 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 25 Nov 2023 12:35:09 +0800
Subject: Fix pre-fp8

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 0a7777f1..90437c87 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -357,7 +357,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
     sd_model_hash = checkpoint_info.calculate_shorthash()
     timer.record("calculate hash")
 
-    if not check_fp8(model) and devices.fp8:
+    if devices.fp8:
         # prevent model to load state dict in fp8
         model.half()
 
-- 
cgit v1.2.1


From 50a21cb09fe3e9ea2d4fe058e0484e192c8a86e3 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 2 Dec 2023 22:06:47 +0800
Subject: Ensure the cached weight will not be affected

---
 modules/sd_models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index 4b8a9ae6..dcf816b3 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -435,9 +435,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         for module in model.modules():
             if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
                 if shared.opts.cache_fp16_weight:
-                    module.fp16_weight = module.weight.clone().half()
+                    module.fp16_weight = module.weight.data.clone().cpu().half()
                     if module.bias is not None:
-                        module.fp16_bias = module.bias.clone().half()
+                        module.fp16_bias = module.bias.data.clone().cpu().half()
                 module.to(torch.float8_e4m3fn)
         model.first_stage_model = first_stage
         timer.record("apply fp8")
-- 
cgit v1.2.1


From 672dc4efa8e0da38426b121e7c7216d0a8e465fd Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:16:10 +0800
Subject: Fix forced reload

---
 modules/sd_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'modules')

diff --git a/modules/sd_models.py b/modules/sd_models.py
index dcf816b3..d0046f88 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -801,7 +801,7 @@ def reload_model_weights(sd_model=None, info=None, forced_reload=False):
         if check_fp8(sd_model) != devices.fp8:
             # load from state dict again to prevent extra numerical errors
             forced_reload = True
-        elif sd_model.sd_model_checkpoint == checkpoint_info.filename:
+        elif sd_model.sd_model_checkpoint == checkpoint_info.filename and not forced_reload:
             return sd_model
 
     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
-- 
cgit v1.2.1


From ea272152e0b50dbb2bd675ec020607f3d50c37d0 Mon Sep 17 00:00:00 2001
From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com>
Date: Sat, 16 Dec 2023 15:08:08 +0800
Subject: Add FP8 settings into PNG info

---
 modules/generation_parameters_copypaste.py | 6 ++++++
 modules/processing.py                      | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'modules')

diff --git a/modules/generation_parameters_copypaste.py b/modules/generation_parameters_copypaste.py
index 4efe53e0..dbffe494 100644
--- a/modules/generation_parameters_copypaste.py
+++ b/modules/generation_parameters_copypaste.py
@@ -314,6 +314,12 @@ Steps: 20, Sampler: Euler a, CFG scale: 7, Seed: 965400086, Size: 512x512, Model
     if "VAE Decoder" not in res:
         res["VAE Decoder"] = "Full"
 
+    if "FP8 weight" not in res:
+        res["FP8 weight"] = "Disable"
+
+    if "Cache FP16 weight for LoRA" not in res and res["FP8 weight"] != "Disable":
+        res["Cache FP16 weight for LoRA"] = False
+
     skip = set(shared.opts.infotext_skip_pasting)
     res = {k: v for k, v in res.items() if k not in skip}
 
diff --git a/modules/processing.py b/modules/processing.py
index bea01ec6..179f2c0f 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -688,6 +688,8 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter
         "Size": f"{p.width}x{p.height}",
         "Model hash": p.sd_model_hash if opts.add_model_hash_to_info else None,
         "Model": p.sd_model_name if opts.add_model_name_to_info else None,
+        "FP8 weight": opts.fp8_storage if devices.fp8 else None,
+        "Cache FP16 weight for LoRA": opts.cache_fp16_weight if devices.fp8 else None,
         "VAE hash": p.sd_vae_hash if opts.add_vae_hash_to_info else None,
         "VAE": p.sd_vae_name if opts.add_vae_name_to_info else None,
         "Variation seed": (None if p.subseed_strength == 0 else (p.all_subseeds[0] if use_main_prompt else all_subseeds[index])),
-- 
cgit v1.2.1