运行时模型优化 Runtime Model Optimizations

fal 的推理引擎绑定采用 torch 模块并应用所有相关的动态编译和量化技术,使其开箱即用,速度更快,而不会向用户泄露任何复杂性。

此 API 目前处于实验阶段,将来可能会发生变化。

示例用法:

```python
import fal
import fal.toolkit
from fal.toolkit import Image
from pydantic import BaseModel, Field

class Input(BaseModel):
    """Request schema for the text-to-image endpoint."""

    # Free-form text prompt; the example below is surfaced in generated API docs.
    prompt: str = Field(
        description="The prompt to generate an image from.",
        examples=[
            "A cinematic shot of a baby racoon wearing an intricate italian priest robe.",
        ],
    )

class Output(BaseModel):
    """Response schema for the text-to-image endpoint."""

    # Single generated image, serialized via fal.toolkit's Image type.
    image: Image = Field(
        description="The generated image.",
    )

class FalModel(fal.App):
    """fal App serving SDXL text-to-image with fal's runtime optimizer applied.

    Loads Stable Diffusion XL once in `setup()` and exposes a single
    POST endpoint at "/" that turns a prompt into one image.
    """

    # Requests a GPU machine; the pipeline is moved to CUDA in setup().
    machine_type = "GPU"
    # Python packages installed into the serving environment.
    requirements = [
        "accelerate",
        "transformers>=4.30.2",
        "diffusers>=0.26",
        "torch>=2.2.0",
    ]

    def setup(self) -> None:
        """One-time initialization: load, optimize, and warm up the pipeline."""
        # Imported lazily so these heavy deps resolve inside the serving
        # environment declared in `requirements`, not at module import time.
        import torch
        from diffusers import AutoPipelineForText2Image

        # Load SDXL in half precision (fp16 weights variant).
        self.pipeline = AutoPipelineForText2Image.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float16,
            variant="fp16",
        )
        self.pipeline.to("cuda")

        # Apply fal's spatial optimizer to the pipeline.
        # NOTE(review): only the UNet and VAE are wrapped; the text encoders
        # are left unoptimized — presumably intentional, confirm with fal docs.
        self.pipeline.unet = fal.toolkit.optimize(self.pipeline.unet)
        self.pipeline.vae = fal.toolkit.optimize(self.pipeline.vae)

        # Warm up the model: run one throwaway generation so any lazy
        # compilation happens here rather than on the first user request.
        self.pipeline(
            prompt="a cat",
            num_inference_steps=30,
        )

    @fal.endpoint("/")
    def text_to_image(self, input: Input) -> Output:
        """Generate one image from `input.prompt` using 30 inference steps."""
        result = self.pipeline(
            prompt=input.prompt,
            num_inference_steps=30,
        )
        # Destructure asserts exactly one image was produced per call.
        [image] = result.images
        return Output(image=Image.from_pil(image))