19 files changed, 341 insertions, 69 deletions
diff --git a/modules/api/api.py b/modules/api/api.py
index b3d74e51..9637cb81 100644
--- a/modules/api/api.py
+++ b/modules/api/api.py
@@ -31,7 +31,7 @@ from typing import Any
 import piexif
 import piexif.helper
 from contextlib import closing
-
+from modules.progress import create_task_id, add_task_to_queue, start_task, finish_task, current_task
 
 def script_name_to_index(name, scripts):
     try:
@@ -336,6 +336,9 @@ class Api:
         return script_args
 
     def text2imgapi(self, txt2imgreq: models.StableDiffusionTxt2ImgProcessingAPI):
+        task_id = create_task_id("text2img")
+        if txt2imgreq.force_task_id is None:
+            task_id = txt2imgreq.force_task_id
         script_runner = scripts.scripts_txt2img
         if not script_runner.scripts:
             script_runner.initialize_scripts(False)
@@ -362,6 +365,8 @@ class Api:
         send_images = args.pop('send_images', True)
         args.pop('save_images', None)
 
+        add_task_to_queue(task_id)
+
         with self.queue_lock:
             with closing(StableDiffusionProcessingTxt2Img(sd_model=shared.sd_model, **args)) as p:
                 p.is_api = True
@@ -371,12 +376,14 @@ class Api:
 
                 try:
                     shared.state.begin(job="scripts_txt2img")
+                    start_task(task_id)
                     if selectable_scripts is not None:
                         p.script_args = script_args
                         processed = scripts.scripts_txt2img.run(p, *p.script_args) # Need to pass args as list here
                     else:
                         p.script_args = tuple(script_args) # Need to pass args as tuple here
                         processed = process_images(p)
+                    finish_task(task_id)
                 finally:
                     shared.state.end()
                     shared.total_tqdm.clear()
@@ -386,6 +393,10 @@ class Api:
         return models.TextToImageResponse(images=b64images, parameters=vars(txt2imgreq), info=processed.js())
 
     def img2imgapi(self, img2imgreq: models.StableDiffusionImg2ImgProcessingAPI):
+        task_id = create_task_id("img2img")
+        if img2imgreq.force_task_id is None:
+            task_id = img2imgreq.force_task_id
+
         init_images = img2imgreq.init_images
         if init_images is None:
             raise HTTPException(status_code=404, detail="Init image not found")
@@ -422,6 +433,8 @@ class Api:
         send_images = args.pop('send_images', True)
         args.pop('save_images', None)
 
+        add_task_to_queue(task_id)
+
         with self.queue_lock:
             with closing(StableDiffusionProcessingImg2Img(sd_model=shared.sd_model, **args)) as p:
                 p.init_images = [decode_base64_to_image(x) for x in init_images]
@@ -432,12 +445,14 @@ class Api:
 
                 try:
                     shared.state.begin(job="scripts_img2img")
+                    start_task(task_id)
                     if selectable_scripts is not None:
                         p.script_args = script_args
                         processed = scripts.scripts_img2img.run(p, *p.script_args) # Need to pass args as list here
                     else:
                         p.script_args = tuple(script_args) # Need to pass args as tuple here
                         processed = process_images(p)
+                    finish_task(task_id)
                 finally:
                     shared.state.end()
                     shared.total_tqdm.clear()
@@ -511,7 +526,7 @@ class Api:
         if shared.state.current_image and not req.skip_current_image:
             current_image = encode_pil_to_base64(shared.state.current_image)
 
-        return models.ProgressResponse(progress=progress, eta_relative=eta_relative, state=shared.state.dict(), current_image=current_image, textinfo=shared.state.textinfo)
+        return models.ProgressResponse(progress=progress, eta_relative=eta_relative, state=shared.state.dict(), current_image=current_image, textinfo=shared.state.textinfo, current_task=current_task)
 
     def interrogateapi(self, interrogatereq: models.InterrogateRequest):
         image_b64 = interrogatereq.image
diff --git a/modules/api/models.py b/modules/api/models.py
index 33894b3e..58083a34 100644
--- a/modules/api/models.py
+++ b/modules/api/models.py
@@ -107,6 +107,7 @@ StableDiffusionTxt2ImgProcessingAPI = PydanticModelGenerator(
         {"key": "send_images", "type": bool, "default": True},
         {"key": "save_images", "type": bool, "default": False},
         {"key": "alwayson_scripts", "type": dict, "default": {}},
+        {"key": "force_task_id", "type": str, "default": None},
     ]
 ).generate_model()
 
@@ -124,6 +125,7 @@ StableDiffusionImg2ImgProcessingAPI = PydanticModelGenerator(
         {"key": "send_images", "type": bool, "default": True},
         {"key": "save_images", "type": bool, "default": False},
         {"key": "alwayson_scripts", "type": dict, "default": {}},
+        {"key": "force_task_id", "type": str, "default": None},
     ]
 ).generate_model()
 
diff --git a/modules/devices.py b/modules/devices.py
index ea1f712f..c956207f 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -23,6 +23,23 @@ def has_mps() -> bool:
         return mac_specific.has_mps
 
 
+def cuda_no_autocast(device_id=None) -> bool:
+    if device_id is None:
+        device_id = get_cuda_device_id()
+    return (
+        torch.cuda.get_device_capability(device_id) == (7, 5)
+        and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
+    )
+
+
+def get_cuda_device_id():
+    return (
+        int(shared.cmd_opts.device_id)
+        if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()
+        else 0
+    ) or torch.cuda.current_device()
+
+
 def get_cuda_device_string():
     if shared.cmd_opts.device_id is not None:
         return f"cuda:{shared.cmd_opts.device_id}"
@@ -73,8 +90,7 @@ def enable_tf32():
 
         # enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
         # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
-        device_id = (int(shared.cmd_opts.device_id) if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit() else 0) or torch.cuda.current_device()
-        if torch.cuda.get_device_capability(device_id) == (7, 5) and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16"):
+        if cuda_no_autocast():
             torch.backends.cudnn.benchmark = True
 
         torch.backends.cuda.matmul.allow_tf32 = True
@@ -84,6 +100,7 @@ def enable_tf32():
 errors.run(enable_tf32, "Enabling TF32")
 
 cpu: torch.device = torch.device("cpu")
+fp8: bool = False
 device: torch.device = None
 device_interrogate: torch.device = None
 device_gfpgan: torch.device = None
@@ -104,12 +121,51 @@ def cond_cast_float(input):
 
 
 nv_rng = None
+patch_module_list = [
+    torch.nn.Linear,
+    torch.nn.Conv2d,
+    torch.nn.MultiheadAttention,
+    torch.nn.GroupNorm,
+    torch.nn.LayerNorm,
+]
+
+
+def manual_cast_forward(self, *args, **kwargs):
+    org_dtype = next(self.parameters()).dtype
+    self.to(dtype)
+    args = [arg.to(dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
+    kwargs = {k: v.to(dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
+    result = self.org_forward(*args, **kwargs)
+    self.to(org_dtype)
+    return result
+
+
+@contextlib.contextmanager
+def manual_cast():
+    for module_type in patch_module_list:
+        org_forward = module_type.forward
+        module_type.forward = manual_cast_forward
+        module_type.org_forward = org_forward
+    try:
+        yield None
+    finally:
+        for module_type in patch_module_list:
+            module_type.forward = module_type.org_forward
 
 
 def autocast(disable=False):
     if disable:
         return contextlib.nullcontext()
 
+    if fp8 and device==cpu:
+        return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
+
+    if fp8 and (dtype == torch.float32 or shared.cmd_opts.precision == "full" or cuda_no_autocast()):
+        return manual_cast()
+
+    if has_mps() and shared.cmd_opts.precision != "full":
+        return manual_cast()
+
     if dtype == torch.float32 or shared.cmd_opts.precision == "full":
         return contextlib.nullcontext()
 
diff --git a/modules/errors.py b/modules/errors.py
index eb234a83..48aa13a1 100644
--- a/modules/errors.py
+++ b/modules/errors.py
@@ -107,8 +107,8 @@ def check_versions():
     import torch
     import gradio
 
-    expected_torch_version = "2.0.0"
-    expected_xformers_version = "0.0.20"
+    expected_torch_version = "2.1.2"
+    expected_xformers_version = "0.0.23.post1"
     expected_gradio_version = "3.41.2"
 
     if version.parse(torch.__version__) < version.parse(expected_torch_version):
diff --git a/modules/generation_parameters_copypaste.py b/modules/generation_parameters_copypaste.py
index 4efe53e0..dbffe494 100644
--- a/modules/generation_parameters_copypaste.py
+++ b/modules/generation_parameters_copypaste.py
@@ -314,6 +314,12 @@ Steps: 20, Sampler: Euler a, CFG scale: 7, Seed: 965400086, Size: 512x512, Model
     if "VAE Decoder" not in res:
         res["VAE Decoder"] = "Full"
 
+    if "FP8 weight" not in res:
+        res["FP8 weight"] = "Disable"
+
+    if "Cache FP16 weight for LoRA" not in res and res["FP8 weight"] != "Disable":
+        res["Cache FP16 weight for LoRA"] = False
+
     skip = set(shared.opts.infotext_skip_pasting)
     res = {k: v for k, v in res.items() if k not in skip}
 
diff --git a/modules/images.py b/modules/images.py
index daf4eebe..16f9ae7c 100644
--- a/modules/images.py
+++ b/modules/images.py
@@ -791,3 +791,4 @@ def flatten(img, bgcolor):
         img = background
 
     return img.convert('RGB')
+
diff --git a/modules/initialize_util.py b/modules/initialize_util.py
index 2e9b6d89..b6767138 100644
--- a/modules/initialize_util.py
+++ b/modules/initialize_util.py
@@ -177,6 +177,8 @@ def configure_opts_onchange():
     shared.opts.onchange("temp_dir", ui_tempdir.on_tmpdir_changed)
     shared.opts.onchange("gradio_theme", shared.reload_gradio_theme)
     shared.opts.onchange("cross_attention_optimization", wrap_queued_call(lambda: sd_hijack.model_hijack.redo_hijack(shared.sd_model)), call=False)
+    shared.opts.onchange("fp8_storage", wrap_queued_call(lambda: sd_models.reload_model_weights()), call=False)
+    shared.opts.onchange("cache_fp16_weight", wrap_queued_call(lambda: sd_models.reload_model_weights(forced_reload=True)), call=False)
     startup_timer.record("opts onchange")
 
 
diff --git a/modules/launch_utils.py b/modules/launch_utils.py
index 29506f24..dabef0f5 100644
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@@ -314,8 +314,8 @@ def requirements_met(requirements_file):
 
 
 def prepare_environment():
-    torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu118")
-    torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.0.1 torchvision==0.15.2 --extra-index-url {torch_index_url}")
+    torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu121")
+    torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.1.2 torchvision==0.16.2 --extra-index-url {torch_index_url}")
     if args.use_ipex:
         if platform.system() == "Windows":
             # The "Nuullll/intel-extension-for-pytorch" wheels were built from IPEX source for Intel Arc GPU: https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main
@@ -338,7 +338,7 @@ def prepare_environment():
             torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.0.0a0 intel-extension-for-pytorch==2.0.110+gitba7f6c1 --extra-index-url {torch_index_url}")
     requirements_file = os.environ.get('REQS_FILE', "requirements_versions.txt")
 
-    xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.20')
+    xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.23.post1')
     clip_package = os.environ.get('CLIP_PACKAGE', "https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip")
     openclip_package = os.environ.get('OPENCLIP_PACKAGE', "https://github.com/mlfoundations/open_clip/archive/bb6e834e9c70d9c27d0dc3ecedeebeaeb1ffad6b.zip")
 
diff --git a/modules/processing.py b/modules/processing.py
index 6f01c95f..dd97b4ee 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -62,18 +62,22 @@ def apply_color_correction(correction, original_image):
     return image.convert('RGB')
 
 
-def apply_overlay(image, paste_loc, index, overlays):
-    if overlays is None or index >= len(overlays):
-        return image
+def uncrop(image, dest_size, paste_loc):
+    x, y, w, h = paste_loc
+    base_image = Image.new('RGBA', dest_size)
+    image = images.resize_image(1, image, w, h)
+    base_image.paste(image, (x, y))
+    image = base_image
+
+    return image
 
-    overlay = overlays[index]
+
+def apply_overlay(image, paste_loc, overlay):
+    if overlay is None:
+        return image
 
     if paste_loc is not None:
-        x, y, w, h = paste_loc
-        base_image = Image.new('RGBA', (overlay.width, overlay.height))
-        image = images.resize_image(1, image, w, h)
-        base_image.paste(image, (x, y))
-        image = base_image
+        image = uncrop(image, (overlay.width, overlay.height), paste_loc)
 
     image = image.convert('RGBA')
     image.alpha_composite(overlay)
@@ -81,9 +85,12 @@ def apply_overlay(image, paste_loc, index, overlays):
 
     return image
 
-def create_binary_mask(image):
+def create_binary_mask(image, round=True):
     if image.mode == 'RGBA' and image.getextrema()[-1] != (255, 255):
-        image = image.split()[-1].convert("L").point(lambda x: 255 if x > 128 else 0)
+        if round:
+            image = image.split()[-1].convert("L").point(lambda x: 255 if x > 128 else 0)
+        else:
+            image = image.split()[-1].convert("L")
     else:
         image = image.convert('L')
     return image
@@ -308,7 +315,7 @@ class StableDiffusionProcessing:
             c_adm = torch.cat((c_adm, noise_level_emb), 1)
         return c_adm
 
-    def inpainting_image_conditioning(self, source_image, latent_image, image_mask=None):
+    def inpainting_image_conditioning(self, source_image, latent_image, image_mask=None, round_image_mask=True):
         self.is_using_inpainting_conditioning = True
 
         # Handle the different mask inputs
@@ -320,8 +327,10 @@ class StableDiffusionProcessing:
                 conditioning_mask = conditioning_mask.astype(np.float32) / 255.0
                 conditioning_mask = torch.from_numpy(conditioning_mask[None, None])
 
-                # Inpainting model uses a discretized mask as input, so we round to either 1.0 or 0.0
-                conditioning_mask = torch.round(conditioning_mask)
+                if round_image_mask:
+                    # Caller is requesting a discretized mask as input, so we round to either 1.0 or 0.0
+                    conditioning_mask = torch.round(conditioning_mask)
+
         else:
             conditioning_mask = source_image.new_ones(1, 1, *source_image.shape[-2:])
 
@@ -345,7 +354,7 @@ class StableDiffusionProcessing:
 
         return image_conditioning
 
-    def img2img_image_conditioning(self, source_image, latent_image, image_mask=None):
+    def img2img_image_conditioning(self, source_image, latent_image, image_mask=None, round_image_mask=True):
         source_image = devices.cond_cast_float(source_image)
 
         # HACK: Using introspection as the Depth2Image model doesn't appear to uniquely
@@ -357,7 +366,7 @@ class StableDiffusionProcessing:
             return self.edit_image_conditioning(source_image)
 
         if self.sampler.conditioning_key in {'hybrid', 'concat'}:
-            return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
+            return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask, round_image_mask=round_image_mask)
 
         if self.sampler.conditioning_key == "crossattn-adm":
             return self.unclip_image_conditioning(source_image)
@@ -679,6 +688,8 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter
         "Size": f"{p.width}x{p.height}",
         "Model hash": p.sd_model_hash if opts.add_model_hash_to_info else None,
         "Model": p.sd_model_name if opts.add_model_name_to_info else None,
+        "FP8 weight": opts.fp8_storage if devices.fp8 else None,
+        "Cache FP16 weight for LoRA": opts.cache_fp16_weight if devices.fp8 else None,
         "VAE hash": p.sd_vae_hash if opts.add_vae_hash_to_info else None,
         "VAE": p.sd_vae_name if opts.add_vae_name_to_info else None,
         "Variation seed": (None if p.subseed_strength == 0 else (p.all_subseeds[0] if use_main_prompt else all_subseeds[index])),
@@ -867,6 +878,11 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
                 samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
 
+            if p.scripts is not None:
+                ps = scripts.PostSampleArgs(samples_ddim)
+                p.scripts.post_sample(p, ps)
+                samples_ddim = ps.samples
+
             if getattr(samples_ddim, 'already_decoded', False):
                 x_samples_ddim = samples_ddim
             else:
@@ -922,13 +938,31 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
                     pp = scripts.PostprocessImageArgs(image)
                     p.scripts.postprocess_image(p, pp)
                     image = pp.image
+
+                mask_for_overlay = getattr(p, "mask_for_overlay", None)
+                overlay_image = p.overlay_images[i] if getattr(p, "overlay_images", None) is not None and i < len(p.overlay_images) else None
+
+                if p.scripts is not None:
+                    ppmo = scripts.PostProcessMaskOverlayArgs(i, mask_for_overlay, overlay_image)
+                    p.scripts.postprocess_maskoverlay(p, ppmo)
+                    mask_for_overlay, overlay_image = ppmo.mask_for_overlay, ppmo.overlay_image
+
                 if p.color_corrections is not None and i < len(p.color_corrections):
                     if save_samples and opts.save_images_before_color_correction:
-                        image_without_cc = apply_overlay(image, p.paste_to, i, p.overlay_images)
+                        image_without_cc = apply_overlay(image, p.paste_to, overlay_image)
                         images.save_image(image_without_cc, p.outpath_samples, "", p.seeds[i], p.prompts[i], opts.samples_format, info=infotext(i), p=p, suffix="-before-color-correction")
                     image = apply_color_correction(p.color_corrections[i], image)
 
-                image = apply_overlay(image, p.paste_to, i, p.overlay_images)
+                # If the intention is to show the output from the model
+                # that is being composited over the original image,
+                # we need to keep the original image around
+                # and use it in the composite step.
+                original_denoised_image = image.copy()
+
+                if p.paste_to is not None:
+                    original_denoised_image = uncrop(original_denoised_image, (overlay_image.width, overlay_image.height), p.paste_to)
+
+                image = apply_overlay(image, p.paste_to, overlay_image)
 
                 if save_samples:
                     images.save_image(image, p.outpath_samples, "", p.seeds[i], p.prompts[i], opts.samples_format, info=infotext(i), p=p)
@@ -938,16 +972,17 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
                 if opts.enable_pnginfo:
                     image.info["parameters"] = text
                 output_images.append(image)
-                if hasattr(p, 'mask_for_overlay') and p.mask_for_overlay:
+
+                if mask_for_overlay is not None:
                     if opts.return_mask or opts.save_mask:
-                        image_mask = p.mask_for_overlay.convert('RGB')
+                        image_mask = mask_for_overlay.convert('RGB')
                         if save_samples and opts.save_mask:
                             images.save_image(image_mask, p.outpath_samples, "", p.seeds[i], p.prompts[i], opts.samples_format, info=infotext(i), p=p, suffix="-mask")
                         if opts.return_mask:
                             output_images.append(image_mask)
 
                     if opts.return_mask_composite or opts.save_mask_composite:
-                        image_mask_composite = Image.composite(image.convert('RGBA').convert('RGBa'), Image.new('RGBa', image.size), images.resize_image(2, p.mask_for_overlay, image.width, image.height).convert('L')).convert('RGBA')
+                        image_mask_composite = Image.composite(original_denoised_image.convert('RGBA').convert('RGBa'), Image.new('RGBa', image.size), images.resize_image(2, mask_for_overlay, image.width, image.height).convert('L')).convert('RGBA')
                         if save_samples and opts.save_mask_composite:
                             images.save_image(image_mask_composite, p.outpath_samples, "", p.seeds[i], p.prompts[i], opts.samples_format, info=infotext(i), p=p, suffix="-mask-composite")
                         if opts.return_mask_composite:
@@ -1025,6 +1060,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
     hr_sampler_name: str = None
     hr_prompt: str = ''
     hr_negative_prompt: str = ''
+    force_task_id: str = None
 
     cached_hr_uc = [None, None]
     cached_hr_c = [None, None]
@@ -1351,12 +1387,14 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
     mask_blur_x: int = 4
     mask_blur_y: int = 4
     mask_blur: int = None
+    mask_round: bool = True
     inpainting_fill: int = 0
     inpaint_full_res: bool = True
     inpaint_full_res_padding: int = 0
     inpainting_mask_invert: int = 0
     initial_noise_multiplier: float = None
     latent_mask: Image = None
+    force_task_id: str = None
 
     image_mask: Any = field(default=None, init=False)
 
@@ -1396,7 +1434,7 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
         if image_mask is not None:
             # image_mask is passed in as RGBA by Gradio to support alpha masks,
             # but we still want to support binary masks.
-            image_mask = create_binary_mask(image_mask)
+            image_mask = create_binary_mask(image_mask, round=self.mask_round)
 
             if self.inpainting_mask_invert:
                 image_mask = ImageOps.invert(image_mask)
@@ -1503,7 +1541,8 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
             latmask = init_mask.convert('RGB').resize((self.init_latent.shape[3], self.init_latent.shape[2]))
             latmask = np.moveaxis(np.array(latmask, dtype=np.float32), 2, 0) / 255
             latmask = latmask[0]
-            latmask = np.around(latmask)
+            if self.mask_round:
+                latmask = np.around(latmask)
             latmask = np.tile(latmask[None], (4, 1, 1))
 
             self.mask = torch.asarray(1.0 - latmask).to(shared.device).type(self.sd_model.dtype)
@@ -1515,7 +1554,7 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
             elif self.inpainting_fill == 3:
                 self.init_latent = self.init_latent * self.mask
 
-        self.image_conditioning = self.img2img_image_conditioning(image * 2 - 1, self.init_latent, image_mask)
+        self.image_conditioning = self.img2img_image_conditioning(image * 2 - 1, self.init_latent, image_mask, self.mask_round)
 
     def sample(self, conditioning, unconditional_conditioning, seeds, subseeds, subseed_strength, prompts):
         x = self.rng.next()
@@ -1527,7 +1566,14 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
         samples = self.sampler.sample_img2img(self, self.init_latent, x, conditioning, unconditional_conditioning, image_conditioning=self.image_conditioning)
 
         if self.mask is not None:
-            samples = samples * self.nmask + self.init_latent * self.mask
+            blended_samples = samples * self.nmask + self.init_latent * self.mask
+
+            if self.scripts is not None:
+                mba = scripts.MaskBlendArgs(samples, self.nmask, self.init_latent, self.mask, blended_samples)
+                self.scripts.on_mask_blend(self, mba)
+                blended_samples = mba.blended_latent
+
+            samples = blended_samples
 
         del x
         devices.torch_gc()
diff --git a/modules/progress.py b/modules/progress.py
index 69921de7..85255e82 100644
--- a/modules/progress.py
+++ b/modules/progress.py
@@ -8,10 +8,13 @@ from pydantic import BaseModel, Field
 from modules.shared import opts
 
 import modules.shared as shared
-
+from collections import OrderedDict
+import string
+import random
+from typing import List
 
 current_task = None
-pending_tasks = {}
+pending_tasks = OrderedDict()
 finished_tasks = []
 recorded_results = []
 recorded_results_limit = 2
@@ -34,6 +37,11 @@ def finish_task(id_task):
     if len(finished_tasks) > 16:
         finished_tasks.pop(0)
 
+def create_task_id(task_type):
+    N = 7
+    res = ''.join(random.choices(string.ascii_uppercase +
+    string.digits, k=N))
+    return f"task({task_type}-{res})"
 
 def record_results(id_task, res):
     recorded_results.append((id_task, res))
@@ -44,6 +52,9 @@ def record_results(id_task, res):
 def add_task_to_queue(id_job):
     pending_tasks[id_job] = time.time()
 
+class PendingTasksResponse(BaseModel):
+    size: int = Field(title="Pending task size")
+    tasks: List[str] = Field(title="Pending task ids")
 
 class ProgressRequest(BaseModel):
     id_task: str = Field(default=None, title="Task ID", description="id of the task to get progress for")
@@ -63,9 +74,16 @@ class ProgressResponse(BaseModel):
 
 
 def setup_progress_api(app):
+    app.add_api_route("/internal/pending-tasks", get_pending_tasks, methods=["GET"])
     return app.add_api_route("/internal/progress", progressapi, methods=["POST"], response_model=ProgressResponse)
 
 
+def get_pending_tasks():
+    pending_tasks_ids = list(pending_tasks)
+    pending_len = len(pending_tasks_ids)
+    return PendingTasksResponse(size=pending_len, tasks=pending_tasks_ids)
+
+
 def progressapi(req: ProgressRequest):
     active = req.id_task == current_task
     queued = req.id_task in pending_tasks
diff --git a/modules/scripts.py b/modules/scripts.py
index 7f9454eb..b6fcf96e 100644
--- a/modules/scripts.py
+++ b/modules/scripts.py
@@ -11,11 +11,31 @@ from modules import shared, paths, script_callbacks, extensions, script_loading,
 
 AlwaysVisible = object()
 
+class MaskBlendArgs:
+    def __init__(self, current_latent, nmask, init_latent, mask, blended_latent, denoiser=None, sigma=None):
+        self.current_latent = current_latent
+        self.nmask = nmask
+        self.init_latent = init_latent
+        self.mask = mask
+        self.blended_latent = blended_latent
+
+        self.denoiser = denoiser
+        self.is_final_blend = denoiser is None
+        self.sigma = sigma
+
+class PostSampleArgs:
+    def __init__(self, samples):
+        self.samples = samples
 
 class PostprocessImageArgs:
     def __init__(self, image):
         self.image = image
 
+class PostProcessMaskOverlayArgs:
+    def __init__(self, index, mask_for_overlay, overlay_image):
+        self.index = index
+        self.mask_for_overlay = mask_for_overlay
+        self.overlay_image = overlay_image
 
 class PostprocessBatchListArgs:
     def __init__(self, images):
@@ -206,6 +226,25 @@ class Script:
 
         pass
 
+    def on_mask_blend(self, p, mba: MaskBlendArgs, *args):
+        """
+        Called in inpainting mode when the original content is blended with the inpainted content.
+        This is called at every step in the denoising process and once at the end.
+        If is_final_blend is true, this is called for the final blending stage.
+        Otherwise, denoiser and sigma are defined and may be used to inform the procedure.
+        """
+
+        pass
+
+    def post_sample(self, p, ps: PostSampleArgs, *args):
+        """
+        Called after the samples have been generated,
+        but before they have been decoded by the VAE, if applicable.
+        Check getattr(samples, 'already_decoded', False) to test if the images are decoded.
+        """
+
+        pass
+
     def postprocess_image(self, p, pp: PostprocessImageArgs, *args):
         """
         Called for every image after it has been generated.
@@ -213,6 +252,13 @@ class Script:
 
         pass
 
+    def postprocess_maskoverlay(self, p, ppmo: PostProcessMaskOverlayArgs, *args):
+        """
+        Called for every image after it has been generated.
+        """
+
+        pass
+
     def postprocess(self, p, processed, *args):
         """
         This function is called after processing ends for AlwaysVisible scripts.
@@ -767,6 +813,22 @@ class ScriptRunner:
             except Exception:
                 errors.report(f"Error running postprocess_batch_list: {script.filename}", exc_info=True)
 
+    def post_sample(self, p, ps: PostSampleArgs):
+        for script in self.alwayson_scripts:
+            try:
+                script_args = p.script_args[script.args_from:script.args_to]
+                script.post_sample(p, ps, *script_args)
+            except Exception:
+                errors.report(f"Error running post_sample: {script.filename}", exc_info=True)
+
+    def on_mask_blend(self, p, mba: MaskBlendArgs):
+        for script in self.alwayson_scripts:
+            try:
+                script_args = p.script_args[script.args_from:script.args_to]
+                script.on_mask_blend(p, mba, *script_args)
+            except Exception:
+                errors.report(f"Error running post_sample: {script.filename}", exc_info=True)
+
     def postprocess_image(self, p, pp: PostprocessImageArgs):
         for script in self.alwayson_scripts:
             try:
@@ -775,6 +837,14 @@ class ScriptRunner:
             except Exception:
                 errors.report(f"Error running postprocess_image: {script.filename}", exc_info=True)
 
+    def postprocess_maskoverlay(self, p, ppmo: PostProcessMaskOverlayArgs):
+        for script in self.alwayson_scripts:
+            try:
+                script_args = p.script_args[script.args_from:script.args_to]
+                script.postprocess_maskoverlay(p, ppmo, *script_args)
+            except Exception:
+                errors.report(f"Error running postprocess_image: {script.filename}", exc_info=True)
+
     def before_component(self, component, **kwargs):
         for callback, script in self.on_before_component_elem_id.get(kwargs.get("elem_id"), []):
             try:
diff --git a/modules/sd_disable_initialization.py b/modules/sd_disable_initialization.py
index 8863107a..273a7edd 100644
--- a/modules/sd_disable_initialization.py
+++ b/modules/sd_disable_initialization.py
@@ -215,7 +215,7 @@ class LoadStateDictOnMeta(ReplaceHelper):
             would be on the meta device.
             """
 
-            if state_dict == sd:
+            if state_dict is sd:
                 state_dict = {k: v.to(device="meta", dtype=v.dtype) for k, v in state_dict.items()}
 
             original(module, state_dict, strict=strict)
diff --git a/modules/sd_models.py b/modules/sd_models.py
index 9355f1e1..d0046f88 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -348,10 +348,28 @@ class SkipWritingToConfig:
         SkipWritingToConfig.skip = self.previous
 
 
+def check_fp8(model):
+    if model is None:
+        return None
+    if devices.get_optimal_device_name() == "mps":
+        enable_fp8 = False
+    elif shared.opts.fp8_storage == "Enable":
+        enable_fp8 = True
+    elif getattr(model, "is_sdxl", False) and shared.opts.fp8_storage == "Enable for SDXL":
+        enable_fp8 = True
+    else:
+        enable_fp8 = False
+    return enable_fp8
+
+
 def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer):
     sd_model_hash = checkpoint_info.calculate_shorthash()
     timer.record("calculate hash")
 
+    if devices.fp8:
+        # prevent model to load state dict in fp8
+        model.half()
+
     if not SkipWritingToConfig.skip:
         shared.opts.data["sd_model_checkpoint"] = checkpoint_info.title
 
@@ -404,6 +422,28 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
         devices.dtype_unet = torch.float16
         timer.record("apply half()")
 
+    for module in model.modules():
+        if hasattr(module, 'fp16_weight'):
+            del module.fp16_weight
+        if hasattr(module, 'fp16_bias'):
+            del module.fp16_bias
+
+    if check_fp8(model):
+        devices.fp8 = True
+        first_stage = model.first_stage_model
+        model.first_stage_model = None
+        for module in model.modules():
+            if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
+                if shared.opts.cache_fp16_weight:
+                    module.fp16_weight = module.weight.data.clone().cpu().half()
+                    if module.bias is not None:
+                        module.fp16_bias = module.bias.data.clone().cpu().half()
+                module.to(torch.float8_e4m3fn)
+        model.first_stage_model = first_stage
+        timer.record("apply fp8")
+    else:
+        devices.fp8 = False
+
     devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
 
     model.first_stage_model.to(devices.dtype_vae)
@@ -746,7 +786,7 @@ def reuse_model_from_already_loaded(sd_model, checkpoint_info, timer):
         return None
 
 
-def reload_model_weights(sd_model=None, info=None):
+def reload_model_weights(sd_model=None, info=None, forced_reload=False):
     checkpoint_info = info or select_checkpoint()
 
     timer = Timer()
@@ -758,11 +798,14 @@ def reload_model_weights(sd_model=None, info=None):
         current_checkpoint_info = None
     else:
         current_checkpoint_info = sd_model.sd_checkpoint_info
-        if sd_model.sd_model_checkpoint == checkpoint_info.filename:
+        if check_fp8(sd_model) != devices.fp8:
+            # load from state dict again to prevent extra numerical errors
+            forced_reload = True
+        elif sd_model.sd_model_checkpoint == checkpoint_info.filename and not forced_reload:
             return sd_model
 
     sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
-    if sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
+    if not forced_reload and sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
         return sd_model
 
     if sd_model is not None:
diff --git a/modules/sd_models_xl.py b/modules/sd_models_xl.py
index 01123321..11259a36 100644
--- a/modules/sd_models_xl.py
+++ b/modules/sd_models_xl.py
@@ -93,7 +93,7 @@ def extend_sdxl(model):
     model.parameterization = "v" if isinstance(model.denoiser.scaling, sgm.modules.diffusionmodules.denoiser_scaling.VScaling) else "eps"
 
     discretization = sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization()
-    model.alphas_cumprod = torch.asarray(discretization.alphas_cumprod, device=devices.device, dtype=dtype)
+    model.alphas_cumprod = torch.asarray(discretization.alphas_cumprod, device=devices.device, dtype=torch.float32)
 
     model.conditioner.wrapped = torch.nn.Module()
 
diff --git a/modules/sd_samplers_cfg_denoiser.py b/modules/sd_samplers_cfg_denoiser.py
index b8101d38..eb9d5daf 100644
--- a/modules/sd_samplers_cfg_denoiser.py
+++ b/modules/sd_samplers_cfg_denoiser.py
@@ -56,6 +56,9 @@ class CFGDenoiser(torch.nn.Module):
         self.sampler = sampler
         self.model_wrap = None
         self.p = None
+
+        # NOTE: masking before denoising can cause the original latents to be oversmoothed
+        # as the original latents do not have noise
         self.mask_before_denoising = False
 
     @property
@@ -105,8 +108,21 @@ class CFGDenoiser(torch.nn.Module):
 
         assert not is_edit_model or all(len(conds) == 1 for conds in conds_list), "AND is not supported for InstructPix2Pix checkpoint (unless using Image CFG scale = 1.0)"
 
+        # If we use masks, blending between the denoised and original latent images occurs here.
+        def apply_blend(current_latent):
+            blended_latent = current_latent * self.nmask + self.init_latent * self.mask
+
+            if self.p.scripts is not None:
+                from modules import scripts
+                mba = scripts.MaskBlendArgs(current_latent, self.nmask, self.init_latent, self.mask, blended_latent, denoiser=self, sigma=sigma)
+                self.p.scripts.on_mask_blend(self.p, mba)
+                blended_latent = mba.blended_latent
+
+            return blended_latent
+
+        # Blend in the original latents (before)
         if self.mask_before_denoising and self.mask is not None:
-            x = self.init_latent * self.mask + self.nmask * x
+            x = apply_blend(x)
 
         batch_size = len(conds_list)
         repeats = [len(conds_list[i]) for i in range(batch_size)]
@@ -207,8 +223,9 @@ class CFGDenoiser(torch.nn.Module):
         else:
             denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale)
 
+        # Blend in the original latents (after)
         if not self.mask_before_denoising and self.mask is not None:
-            denoised = self.init_latent * self.mask + self.nmask * denoised
+            denoised = apply_blend(denoised)
 
         self.sampler.last_latent = self.get_pred_x0(torch.cat([x_in[i:i + 1] for i in denoised_image_indexes]), torch.cat([x_out[i:i + 1] for i in denoised_image_indexes]), sigma)
 
diff --git a/modules/shared_options.py b/modules/shared_options.py
index e5de0d01..fa542ba8 100644
--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@@ -206,6 +206,8 @@ options_templates.update(options_section(('optimizations', "Optimizations", "sd"
     "pad_cond_uncond": OptionInfo(False, "Pad prompt/negative prompt to be same length", infotext='Pad conds').info("improves performance when prompt and negative prompt have different lengths; changes seeds"),
     "persistent_cond_cache": OptionInfo(True, "Persistent cond cache").info("do not recalculate conds from prompts if prompts have not changed since previous calculation"),
     "batch_cond_uncond": OptionInfo(True, "Batch cond/uncond").info("do both conditional and unconditional denoising in one batch; uses a bit more VRAM during sampling, but improves speed; previously this was controlled by --always-batch-cond-uncond comandline argument"),
+    "fp8_storage": OptionInfo("Disable", "FP8 weight", gr.Radio, {"choices": ["Disable", "Enable for SDXL", "Enable"]}).info("Use FP8 to store Linear/Conv layers' weight. Require pytorch>=2.1.0."),
+    "cache_fp16_weight": OptionInfo(False, "Cache FP16 weight for LoRA").info("Cache fp16 weight when enabling FP8, will increase the quality of LoRA. Use more system ram."),
 }))
 
 options_templates.update(options_section(('compatibility', "Compatibility", "sd"), {
@@ -256,6 +258,7 @@ options_templates.update(options_section(('ui_prompt_editing', "Prompt editing",
     "keyedit_precision_extra": OptionInfo(0.05, "Precision for <extra networks:0.9> when editing the prompt with Ctrl+up/down", gr.Slider, {"minimum": 0.01, "maximum": 0.2, "step": 0.001}),
     "keyedit_delimiters": OptionInfo(r".,\/!?%^*;:{}=`~() ", "Word delimiters when editing the prompt with Ctrl+up/down"),
     "keyedit_delimiters_whitespace": OptionInfo(["Tab", "Carriage Return", "Line Feed"], "Ctrl+up/down whitespace delimiters", gr.CheckboxGroup, lambda: {"choices": ["Tab", "Carriage Return", "Line Feed"]}),
+    "keyedit_move": OptionInfo(True, "Alt+left/right moves prompt elements"),
     "disable_token_counters": OptionInfo(False, "Disable prompt token counters").needs_reload_ui(),
 }))
 
@@ -330,6 +333,7 @@ options_templates.update(options_section(('ui', "Live previews", "ui"), {
     "live_preview_content": OptionInfo("Prompt", "Live preview subject", gr.Radio, {"choices": ["Combined", "Prompt", "Negative prompt"]}),
     "live_preview_refresh_period": OptionInfo(1000, "Progressbar and preview update period").info("in milliseconds"),
     "live_preview_fast_interrupt": OptionInfo(False, "Return image with chosen live preview method on interrupt").info("makes interrupts faster"),
+    "js_live_preview_in_modal_lightbox": OptionInfo(False, "Show Live preview in full page image viewer"),
 }))
 
 options_templates.update(options_section(('sampler-params', "Sampler parameters", "sd"), {
diff --git a/modules/styles.py b/modules/styles.py
index 7fb6c2e1..81d9800d 100644
--- a/modules/styles.py
+++ b/modules/styles.py
@@ -98,10 +98,8 @@ class StyleDatabase:
         self.path = path
 
         folder, file = os.path.split(self.path)
-        self.default_file = file.split("*")[0] + ".csv"
-        if self.default_file == ".csv":
-            self.default_file = "styles.csv"
-        self.default_path = os.path.join(folder, self.default_file)
+        filename, _, ext = file.partition('*')
+        self.default_path = os.path.join(folder, filename + ext)
 
         self.prompt_fields = [field for field in PromptStyle._fields if field != "path"]
 
@@ -155,10 +153,8 @@ class StyleDatabase:
                     row["name"], prompt, negative_prompt, path
                 )
 
-    def get_style_paths(self) -> list():
-        """
-        Returns a list of all distinct paths, including the default path, of
-        files that styles are loaded from."""
+    def get_style_paths(self) -> set:
+        """Returns a set of all distinct paths of files that styles are loaded from."""
         # Update any styles without a path to the default path
         for style in list(self.styles.values()):
             if not style.path:
@@ -172,9 +168,9 @@ class StyleDatabase:
                 style_paths.add(style.path)
 
         # Remove any paths for styles that are just list dividers
-        style_paths.remove("do_not_save")
+        style_paths.discard("do_not_save")
 
-        return list(style_paths)
+        return style_paths
 
     def get_style_prompts(self, styles):
         return [self.styles.get(x, self.no_style).prompt for x in styles]
@@ -196,20 +192,7 @@ class StyleDatabase:
         # The path argument is deprecated, but kept for backwards compatibility
         _ = path
 
-        # Update any styles without a path to the default path
-        for style in list(self.styles.values()):
-            if not style.path:
-                self.styles[style.name] = style._replace(path=self.default_path)
-
-        # Create a list of all distinct paths, including the default path
-        style_paths = set()
-        style_paths.add(self.default_path)
-        for _, style in self.styles.items():
-            if style.path:
-                style_paths.add(style.path)
-
-        # Remove any paths for styles that are just list dividers
-        style_paths.remove("do_not_save")
+        style_paths = self.get_style_paths()
 
         csv_names = [os.path.split(path)[1].lower() for path in style_paths]
 
diff --git a/modules/ui_toprow.py b/modules/ui_toprow.py
index 88838f97..9caf8faa 100644
--- a/modules/ui_toprow.py
+++ b/modules/ui_toprow.py
@@ -79,11 +79,11 @@ class Toprow:
     def create_prompts(self):
         with gr.Column(elem_id=f"{self.id_part}_prompt_container", elem_classes=["prompt-container-compact"] if self.is_compact else [], scale=6):
             with gr.Row(elem_id=f"{self.id_part}_prompt_row", elem_classes=["prompt-row"]):
-                self.prompt = gr.Textbox(label="Prompt", elem_id=f"{self.id_part}_prompt", show_label=False, lines=3, placeholder="Prompt (press Ctrl+Enter or Alt+Enter to generate)", elem_classes=["prompt"])
+                self.prompt = gr.Textbox(label="Prompt", elem_id=f"{self.id_part}_prompt", show_label=False, lines=3, placeholder="Prompt\n(Press Ctrl+Enter to generate, Alt+Enter to skip, Esc to interrupt)", elem_classes=["prompt"])
                 self.prompt_img = gr.File(label="", elem_id=f"{self.id_part}_prompt_image", file_count="single", type="binary", visible=False)
 
             with gr.Row(elem_id=f"{self.id_part}_neg_prompt_row", elem_classes=["prompt-row"]):
-                self.negative_prompt = gr.Textbox(label="Negative prompt", elem_id=f"{self.id_part}_neg_prompt", show_label=False, lines=3, placeholder="Negative prompt (press Ctrl+Enter or Alt+Enter to generate)", elem_classes=["prompt"])
+                self.negative_prompt = gr.Textbox(label="Negative prompt", elem_id=f"{self.id_part}_neg_prompt", show_label=False, lines=3, placeholder="Negative prompt\n(Press Ctrl+Enter to generate, Alt+Enter to skip, Esc to interrupt)", elem_classes=["prompt"])
 
         self.prompt_img.change(
             fn=modules.images.image_data,
diff --git a/modules/xpu_specific.py b/modules/xpu_specific.py
index d933c790..d8da94a0 100644
--- a/modules/xpu_specific.py
+++ b/modules/xpu_specific.py
@@ -48,3 +48,12 @@ if has_xpu:
     CondFunc('torch.nn.modules.conv.Conv2d.forward',
         lambda orig_func, self, input: orig_func(self, input.to(self.weight.data.dtype)),
         lambda orig_func, self, input: input.dtype != self.weight.data.dtype)
+    CondFunc('torch.bmm',
+        lambda orig_func, input, mat2, out=None: orig_func(input.to(mat2.dtype), mat2, out=out),
+        lambda orig_func, input, mat2, out=None: input.dtype != mat2.dtype)
+    CondFunc('torch.cat',
+        lambda orig_func, tensors, dim=0, out=None: orig_func([t.to(tensors[0].dtype) for t in tensors], dim=dim, out=out),
+        lambda orig_func, tensors, dim=0, out=None: not all(t.dtype == tensors[0].dtype for t in tensors))
+    CondFunc('torch.nn.functional.scaled_dot_product_attention',
+        lambda orig_func, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False: orig_func(query, key.to(query.dtype), value.to(query.dtype), attn_mask, dropout_p, is_causal),
+        lambda orig_func, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False: query.dtype != key.dtype or query.dtype != value.dtype)