Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete Container Registry images left after Functions deployment #3439

Merged
merged 10 commits into from
Jun 21, 2021
1 change: 1 addition & 0 deletions src/api.js
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ var api = {
"FIREBASE_CLOUDLOGGING_URL",
"https://logging.googleapis.com"
),
containerRegistryDomain: utils.envOverride("CONTAINER_REGISTRY_DOMAIN", "gcr.io"),
appDistributionOrigin: utils.envOverride(
"FIREBASE_APP_DISTRIBUTION_URL",
"https://firebaseappdistribution.googleapis.com"
Expand Down
160 changes: 160 additions & 0 deletions src/deploy/functions/containerCleaner.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
// This code is very aggressive about running requests in parallel and does not use
// a task queue, because the quota limits for GCR.io are absurdly high. At the time
// of writing, we can make 50K requests per 10m.
// https://cloud.google.com/container-registry/quotas

import * as clc from "cli-color";

import { logger } from "../../logger";
import * as gcr from "../../gcp/containerregistry";
import * as backend from "./backend";
import * as utils from "../../utils";

// A flattening of container_registry_hosts and
// region_multiregion_map from regionconfig.borg
const SUBDOMAIN_MAPPING: Record<string, string> = {
"us-west2": "us",
"us-west3": "us",
"us-west4": "us",
"us-central1": "us",
"us-central2": "us",
"us-east1": "us",
"us-east4": "us",
"northamerica-northeast1": "us",
"southamerica-east1": "us",
"europe-west1": "eu",
"europe-west2": "eu",
"europe-west3": "eu",
"europe-west5": "eu",
"europe-west6": "eu",
"europe-central2": "eu",
"asia-east1": "asia",
"asia-east2": "asia",
"asia-northeast1": "asia",
"asia-northeast2": "asia",
"asia-northeast3": "asia",
"asia-south1": "asia",
"asia-southeast2": "asia",
"australia-southeast1": "asia",
};

export async function cleanupBuildImages(functions: backend.FunctionSpec[]): Promise<void> {
utils.logBullet(clc.bold.cyan("functions: ") + "cleaning up build files...");
const gcrCleaner = new ContainerRegistryCleaner();
try {
await Promise.all(functions.map((func) => gcrCleaner.cleanupFunction(func)));
} catch (err) {
logger.debug("Failed to delete container registry artifacts with error", err);
utils.logLabeledWarning(
"functions",
"Unhnandled error cleaning up build files. This could result in a small monthly bill if not corrected"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be great to include a console link to where you can clean these up, plus a list of the files that were not cleaned up.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I've split the difference between what I think you're asking for and what I think I can reliably deliver (since I can't do a recursive LS to find all files that need to be deleted when I can't trust API calls to succeed).

I've made the code try to resume from as many errors as possible by catching and throwing at the end of each process. Since this can lead to multiple errors, I throw a random error but log all errors. When there are any errors, I print the top-level directory for GCF's images in that multi-region. I have a list format for 2 or more regions and an inline format for one region. I've manually tested a case where there was:

  1. No errors
  2. An error in one region
  3. An error in two regions on the same multi-region
  4. An error in multiple regions that are stored in multiple multi-regions

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo in unhandled

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

);
}

// TODO: clean up Artifact Registry images as well.
}

export class ContainerRegistryCleaner {
readonly helpers: Record<string, ContainerRegistryHelper> = {};

private helper(location: string): ContainerRegistryHelper {
const subdomain = SUBDOMAIN_MAPPING[location] || "us";
if (!this.helpers[subdomain]) {
this.helpers[subdomain] = new ContainerRegistryHelper(subdomain);
}
return this.helpers[subdomain];
}

// GCFv1 has the directory structure:
// gcf/
// +- <region>/
// +- <uuid>
// +- <hash> (tags: <FuncName>_version-<#>)
// +- cache/ (Only present in first deploy of region)
// | +- <hash> (tags: latest)
// +- worker/ (Only present in first deploy of region)
// +- <hash> (tags: latest)
//
// We'll parallel search for the valid <uuid> and their children
// until we find one with the right tag for the function name.
// The underlying Helper's caching should make this expensive for
// the first function and free for the next functions in the same
// region.
async cleanupFunction(func: backend.FunctionSpec): Promise<void> {
const helper = this.helper(func.region);
const uuids = (await helper.ls(`${func.project}/gcf/${func.region}`)).children;

const uuidTags: Record<string, string[]> = {};
const loadUuidTags: Promise<void>[] = [];
for (const uuid of uuids) {
loadUuidTags.push(
(async () => {
const path = `${func.project}/gcf/${func.region}/${uuid}`;
const tags = (await helper.ls(path)).tags;
uuidTags[path] = tags;
})()
);
}
await Promise.all(loadUuidTags);

const extractFunction = /^(.*)_version-\d+$/;
const entry = Object.entries(uuidTags).find(([, tags]) => {
joehan marked this conversation as resolved.
Show resolved Hide resolved
return tags.find((tag) => {
const match = tag.match(extractFunction);
return match && match[1] === func.id;
});
});

if (!entry) {
logger.debug("Could not find image for function", backend.functionName(func));
return;
}
await helper.rm(entry[0]);
}
}

export interface Stat {
children: string[];
digests: gcr.Digest[];
tags: gcr.Tag[];
}

export class ContainerRegistryHelper {
readonly client: gcr.Client;
readonly cache: Record<string, Stat> = {};

constructor(subdomain: string) {
this.client = new gcr.Client(subdomain);
}

async ls(path: string): Promise<Stat> {
if (!this.cache[path]) {
const raw = await this.client.listTags(path);
this.cache[path] = {
tags: raw.tags,
digests: Object.keys(raw.manifest),
children: raw.child,
};
}
return this.cache[path];
}

async rm(path: string): Promise<void> {
const stat = await this.ls(path);
const deleteChildren: Promise<void>[] = [];
const recursive = stat.children.map((child) => this.rm(`${path}/${child}`));
// Let children ("directories") be cleaned up in parallel while we clean
// up the "files" in this location.

const deleteTags = stat.tags.map((tag) => this.client.deleteTag(path, tag));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to delete all tags before starting to delete the images? If not, we could combine these await Promise.all()'s and do them in parallel

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also a little concerned about how this behaves if just one call fails - for example, if the first Promise.all rejects, this errors and we never try to deleteImages... but we already made our recursive call, so that has started but will get 'cut off' at some arbitrary point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Tags pin an image and must be removed before an image can be removed. I've added comments to clarify this.
  2. Agreed. I didn't feel super warm and fuzzy about these errors. I've made rm recurse as far as it can irrespective of errors and throw a random error it encounters. Rather than investing in a multi-error type to aggregate FirebaseErrors, I'm just debug logging the error for future reference.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me - this is definitely a tricky one to handle and it seems likely that if one call fails, the rest are high likelihood to as well. I feel better about this strategy tho.

await Promise.all(deleteTags);
stat.tags = [];

const deleteImages = stat.digests.map((digest) => this.client.deleteImage(path, digest));
await Promise.all(deleteImages);
stat.digests = [];

await Promise.all(recursive);
stat.children = [];
}
}
10 changes: 6 additions & 4 deletions src/deploy/functions/release.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import { getAppEngineLocation } from "../../functionsConfig";
import { promptForFunctionDeletion } from "./prompts";
import { DeploymentTimer } from "./deploymentTimer";
import { ErrorHandler } from "./errorHandler";
import * as utils from "../../utils";
import { Options } from "../../options";
import * as args from "./args";
import * as backend from "./backend";
import * as containerCleaner from "./containerCleaner";
import * as helper from "./functionsDeployHelper";
import * as tasks from "./tasks";
import * as backend from "./backend";
import * as args from "./args";
import { Options } from "../../options";
import * as utils from "../../utils";

export async function release(context: args.Context, options: Options, payload: args.Payload) {
if (!options.config.has("functions")) {
Expand Down Expand Up @@ -136,6 +137,7 @@ export async function release(context: args.Context, options: Options, payload:
);
}
helper.logAndTrackDeployStats(cloudFunctionsQueue, errorHandler);
await containerCleaner.cleanupBuildImages(payload.functions!.backend.cloudFunctions);
await helper.printTriggerUrls(context);
errorHandler.printWarnings();
errorHandler.printErrors();
Expand Down
95 changes: 95 additions & 0 deletions src/gcp/containerregistry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Note: unlike Google APIs, the documentation for the GCR API is
// actually the Docker REST API. This can be found at
// https://docs.docker.com/registry/spec/api/
// This API is _very_ complex in its entirety and is very subtle (e.g. tags and digests
// are both strings and can both be put in the same route to get completely different
// response document types).
// This file will only implement a minimal subset as needed.
import { FirebaseError } from "../error";
import { containerRegistryDomain } from "../api";
import * as api from "../apiv2";

// A Digest is a string in the format <algorithm>:<hex>. For example:
// sha256:146d8c9dff0344fb01417ef28673ed196e38215f3c94837ae733d3b064ba439e
export type Digest = string;
export type Tag = string;

export interface Tags {
name: string;
tags: string[];

// These fields are not documented in the Docker API but are
// present in the GCR API.
manifest: Record<Digest, ImageInfo>;
child: string[];
}

export interface ImageInfo {
// times are string milliseconds
timeCreatedMs: string;
timeUploadedMs: string;
tag: string[];
mediaType: string;
imageSizeBytes: string;
layerId: string;
}

interface ErrorsResponse {
errors?: {
code: string;
message: string;
details: unknown;
}[];
}

function isErrors(response: unknown): response is ErrorsResponse {
return Object.prototype.hasOwnProperty.bind(response)("errors");
}

const API_VERSION = "v2";

export class Client {
readonly client: api.Client;

constructor(subdomain?: string) {
let origin: string;
if (subdomain) {
origin = `https://${subdomain}.${containerRegistryDomain}`;
} else {
origin = `https://${containerRegistryDomain}`;
}
this.client = new api.Client({
apiVersion: API_VERSION,
auth: true,
urlPrefix: origin,
});
}

async listTags(path: string): Promise<Tags> {
const response = await this.client.get<Tags | ErrorsResponse>(`${path}/tags/list`);
if (isErrors(response.body)) {
throw new FirebaseError(`Failed to list GCR tags at ${path}`, {
children: response.body.errors,
});
}
return response.body;
}

async deleteTag(path: string, tag: Tag): Promise<void> {
const response = await this.client.delete<ErrorsResponse>(`${path}/manifests/${tag}`);
if (response.body.errors?.length != 0) {
throw new FirebaseError(`Failed to delete tag ${tag} at path ${path}`, {
children: response.body.errors,
});
}
}

async deleteImage(path: string, digest: Digest): Promise<void> {
const response = await this.client.delete<ErrorsResponse>(`${path}/manifests/${digest}`);
if (response.body.errors?.length != 0) {
throw new FirebaseError(`Failed to delete image ${digest} at path ${path}`, {
children: response.body.errors,
});
}
}
}