
Commit ee49844
Merge pull request #4153 from dhiltgen/gpu_verbose_response
Add GPU usage
dhiltgen committed May 8, 2024
2 parents 8a516ac + bee2f4a commit ee49844
Showing 3 changed files with 40 additions and 20 deletions.
format/bytes.go: 2 additions & 0 deletions
@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
     switch {
+    case b >= GibiByte:
+        return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
     case b >= MebiByte:
         return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
     case b >= KibiByte:
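
For context on why the GiB case matters here: the VRAM and model-size figures this PR starts reporting are GiB-scale, and without it they would fall through to the MiB branch. A minimal standalone sketch of the formatter's behavior; the import path assumes the repository's module name, and the byte count is only an illustration.

package main

import (
    "fmt"

    "github.com/ollama/ollama/format"
)

func main() {
    // 5.5 GiB expressed in bytes; before this change HumanBytes2 would
    // have printed "5632.0 MiB", with it the output is "5.5 GiB".
    var vram uint64 = 5632 * format.MebiByte
    fmt.Println(format.HumanBytes2(vram))
}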
llm/memory.go: 12 additions & 12 deletions
@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     // Split up the GPUs by type and try them
     for _, gpus := range allGpus.ByLibrary() {
         var layerCount int
-        layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+        layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
         if opts.NumGPU < 0 {
             if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                 return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-    if gpus[0].Library == "cpu" {
-        return 0, 0
-    }
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
     var memoryAvailable uint64
     for _, info := range gpus {
         memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
     memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-    if memoryRequiredPartial > memoryAvailable {
-        slog.Debug("insufficient VRAM to load any model layers")
-        return 0, 0
-    }
-
     var memoryLayerOutput uint64
     if layer, ok := layers["output_norm"]; ok {
         memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
             ),
         ),
     )
-    return layerCount, uint64(memoryRequiredPartial)
+    if gpus[0].Library == "cpu" {
+        return 0, 0, memoryRequiredTotal
+    }
+    if memoryRequiredPartial > memoryAvailable {
+        slog.Debug("insufficient VRAM to load any model layers")
+        return 0, 0, memoryRequiredTotal
+    }
+
+    return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
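
The two callers visible in this commit use the widened return in different ways: PredictServerFit discards the new total with a blank identifier, while NewLlamaServer keeps it for reporting. A self-contained sketch of that pattern, where estimate is a hypothetical stand-in for EstimateGPULayers rather than the real function, and the numbers are made up:

package main

import "fmt"

// estimate stands in for EstimateGPULayers: the layer count that fits on
// GPU, the VRAM required for that partial load, and the model's total size.
func estimate() (layers int, vram, total uint64) {
    return 33, 5 << 30, 7 << 30
}

func main() {
    // Fit check (as in PredictServerFit): the total size is not needed.
    layers, vram, _ := estimate()
    fmt.Printf("fits %d layers in %d bytes of VRAM\n", layers, vram)

    // Usage reporting (as in NewLlamaServer): keep all three values.
    layers, vram, total := estimate()
    fmt.Printf("offloading %d layers, %d of %d bytes on GPU\n", layers, vram, total)
}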
llm/server.go: 26 additions & 8 deletions
@@ -49,7 +49,10 @@ type llmServer struct
     options api.Options
 
     // TODO - this should be broken down by GPU
-    estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+    estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+    estimatedTotal uint64 // Total size of model
+    totalLayers    uint64
+    gpuCount       int
 
     sem *semaphore.Weighted
 }
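
These new fields are what make a GPU-usage readout possible downstream: the estimated VRAM footprint versus the model's total size, the layer count, and whether any GPU is involved at all. The helper below is a hypothetical consumer, not code from this commit, and the percentage split is an assumption about how the fields could be presented:

package main

import "fmt"

// stats mirrors the new llmServer fields that matter for a usage readout.
type stats struct {
    estimatedVRAM  uint64
    estimatedTotal uint64
    gpuCount       int
}

// usage is a hypothetical formatter: all on CPU, all on GPU, or a split
// derived from the estimated VRAM share of the total model size.
func usage(s stats) string {
    switch {
    case s.gpuCount == 0:
        return "100% CPU"
    case s.estimatedVRAM >= s.estimatedTotal:
        return "100% GPU"
    default:
        gpu := 100 * float64(s.estimatedVRAM) / float64(s.estimatedTotal)
        return fmt.Sprintf("%.0f%%/%.0f%% CPU/GPU", 100-gpu, gpu)
    }
}

func main() {
    fmt.Println(usage(stats{estimatedVRAM: 5 << 30, estimatedTotal: 7 << 30, gpuCount: 1}))
}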
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
     cpuRunner := ""
     var estimatedVRAM uint64
+    var estimatedTotal uint64
     var systemMemory uint64
+    gpuCount := len(gpus)
     if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 
         // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
         cpuRunner = serverForCpu()
+        gpuCount = 0
     } else {
         if gpus[0].Library == "metal" {
             memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }
     }
     var layers int
-    layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+    layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 
     if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
         // disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     } else {
         slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
         servers = []string{demandLib}
+        if strings.HasPrefix(demandLib, "cpu") {
+            // Omit the GPU flag to silence the warning
+            opts.NumGPU = -1
+        }
     }
 }
 
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
             continue
         }
 
+        if strings.HasPrefix(servers[i], "cpu") {
+            // TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+            gpuCount = 0
+        }
+
         // Find an availableServers port, retry on each iterration in case the failure was a port conflict race
         port := 0
         if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }
 
     s := &llmServer{
-        port:          port,
-        cmd:           exec.Command(server, finalParams...),
-        status:        NewStatusWriter(os.Stderr),
-        options:       opts,
-        estimatedVRAM: estimatedVRAM,
-        sem:           semaphore.NewWeighted(int64(numParallel)),
+        port:           port,
+        cmd:            exec.Command(server, finalParams...),
+        status:         NewStatusWriter(os.Stderr),
+        options:        opts,
+        estimatedVRAM:  estimatedVRAM,
+        estimatedTotal: estimatedTotal,
+        sem:            semaphore.NewWeighted(int64(numParallel)),
+        totalLayers:    ggml.KV().BlockCount() + 1,
+        gpuCount:       gpuCount,
     }
 
     s.cmd.Env = os.Environ()
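
Both of the new CPU-runner checks in this file affect what ends up being reported: a user-forced CPU library clears the GPU layer option so the runner does not warn, and a CPU runner chosen by the fallback loop zeroes the GPU count. A condensed, hypothetical sketch of that logic; in the commit the two checks live at separate points rather than in one helper:

package main

import (
    "fmt"
    "strings"
)

// configureRunner condenses the two new checks: runner names beginning with
// "cpu" imply no GPU layers and no GPUs to report. The function and its
// parameters are illustrative, not part of the commit.
func configureRunner(runner string, numGPU, gpuCount int) (int, int) {
    if strings.HasPrefix(runner, "cpu") {
        numGPU = -1  // leave the GPU layer option unset to silence the warning
        gpuCount = 0 // nothing will be resident on a GPU
    }
    return numGPU, gpuCount
}

func main() {
    numGPU, gpuCount := configureRunner("cpu_avx2", 999, 1)
    fmt.Println(numGPU, gpuCount)
}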
