Skip to content

Commit

Permalink
core(robots): use new fetcher to get robots.txt (#12423)
Browse files Browse the repository at this point in the history
  • Loading branch information
adamraine committed May 4, 2021
1 parent 7a6bc05 commit 03adece
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,17 @@ module.exports = [
},
{
artifacts: {
_maxChromiumMilestone: 91,
RobotsTxt: {
status: null,
content: null,
},
InspectorIssues: {
contentSecurityPolicy: [
{
// TODO: Fix connect-src violation when fetching robots.txt.
// https://github.com/GoogleChrome/lighthouse/issues/10225
//
// Fixed with new fetcher using M92.
blockedURL: 'http://localhost:10200/robots.txt',
violatedDirective: 'connect-src',
isReportOnly: false,
Expand All @@ -75,7 +77,6 @@ module.exports = [
// https://github.com/GoogleChrome/lighthouse/pull/12044#issuecomment-788274938
//
// Fixed with new fetcher using M92.
_maxChromiumMilestone: 91,
sourceMapUrl: 'http://localhost:10200/source-map/script.js.map',
errorMessage: 'Error: Timed out fetching resource.',
map: undefined,
Expand All @@ -87,4 +88,35 @@ module.exports = [
audits: {},
},
},
{
// Same CSP as above, but verifies correct behavior for M92.
artifacts: {
_minChromiumMilestone: 92,
RobotsTxt: {
status: 404,
content: null,
},
InspectorIssues: {
contentSecurityPolicy: [
{
// TODO: Fix style-src-elem violation when checking tap targets.
// https://github.com/GoogleChrome/lighthouse/issues/11862
violatedDirective: 'style-src-elem',
isReportOnly: false,
contentSecurityPolicyViolationType: 'kInlineViolation',
},
],
},
SourceMaps: [{
sourceMapUrl: 'http://localhost:10200/source-map/script.js.map',
map: {},
errorMessage: undefined,
}],
},
lhr: {
requestedUrl: 'http://localhost:10200/csp.html?' + blockAllExceptInlineScriptCsp,
finalUrl: 'http://localhost:10200/csp.html?' + blockAllExceptInlineScriptCsp,
audits: {},
},
},
];
63 changes: 39 additions & 24 deletions lighthouse-core/gather/fetcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

/* global document */

/** @typedef {{content: string|null, status: number|null}} FetchResponse */

const log = require('lighthouse-logger');
const {getBrowserVersion} = require('./driver/environment.js');

Expand Down Expand Up @@ -97,7 +99,7 @@ class Fetcher {
*
* @param {string} url
* @param {{timeout: number}=} options timeout is in ms
* @return {Promise<string>}
* @return {Promise<FetchResponse>}
*/
async fetchResource(url, options = {timeout: 500}) {
if (!this._enabled) {
Expand All @@ -107,7 +109,7 @@ class Fetcher {
// `Network.loadNetworkResource` was introduced in M88.
// The long timeout bug with `IO.read` was fixed in M92:
// https://bugs.chromium.org/p/chromium/issues/detail?id=1191757
const milestone = await getBrowserVersion(this.session).then(v => v.milestone);
const {milestone} = await getBrowserVersion(this.session);
if (milestone >= 92) {
return await this._fetchResourceOverProtocol(url, options);
}
Expand Down Expand Up @@ -141,7 +143,7 @@ class Fetcher {

/**
* @param {string} url
* @return {Promise<LH.Crdp.IO.StreamHandle>}
* @return {Promise<{stream: LH.Crdp.IO.StreamHandle|null, status: number|null}>}
*/
async _loadNetworkResource(url) {
await this.session.sendCommand('Network.enable');
Expand All @@ -156,18 +158,28 @@ class Fetcher {
});
await this.session.sendCommand('Network.disable');

if (!networkResponse.resource.success || !networkResponse.resource.stream) {
const statusCode = networkResponse.resource.httpStatusCode || '';
throw new Error(`Loading network resource failed (${statusCode})`);
}
return {
stream: networkResponse.resource.success ? (networkResponse.resource.stream || null) : null,
status: networkResponse.resource.httpStatusCode || null,
};
}

return networkResponse.resource.stream;
/**
* @param {string} requestId
* @return {Promise<string>}
*/
async _resolveResponseBody(requestId) {
const responseBody = await this.session.sendCommand('Fetch.getResponseBody', {requestId});
if (responseBody.base64Encoded) {
return Buffer.from(responseBody.body, 'base64').toString();
}
return responseBody.body;
}

/**
* @param {string} url
* @param {{timeout: number}} options timeout is in ms
* @return {Promise<string>}
* @return {Promise<FetchResponse>}
*/
async _fetchResourceOverProtocol(url, options) {
const startTime = Date.now();
Expand All @@ -180,21 +192,28 @@ class Fetcher {
}, options.timeout);
});

const fetchStreamPromise = this._loadNetworkResource(url);
const stream = await Promise.race([fetchStreamPromise, timeoutPromise])
const responsePromise = this._loadNetworkResource(url);

/** @type {{stream: LH.Crdp.IO.StreamHandle|null, status: number|null}} */
const response = await Promise.race([responsePromise, timeoutPromise])
.finally(() => clearTimeout(timeoutHandle));

return await this._readIOStream(stream, {timeout: options.timeout - (Date.now() - startTime)});
const isOk = response.status && response.status >= 200 && response.status <= 299;
if (!response.stream || !isOk) return {status: response.status, content: null};

const timeout = options.timeout - (Date.now() - startTime);
const content = await this._readIOStream(response.stream, {timeout});
return {status: response.status, content};
}

/**
* Fetches resource by injecting an iframe into the page.
* @param {string} url
* @param {{timeout: number}} options timeout is in ms
* @return {Promise<string>}
* @return {Promise<FetchResponse>}
*/
async _fetchResourceIframe(url, options) {
/** @type {Promise<string>} */
/** @type {Promise<FetchResponse>} */
const requestInterceptionPromise = new Promise((resolve, reject) => {
/** @param {LH.Crdp.Fetch.RequestPausedEvent} event */
const handlerAsync = async event => {
Expand All @@ -216,17 +235,13 @@ class Fetcher {
return;
}

// Now in the response stage, but the request failed.
if (!(responseStatusCode >= 200 && responseStatusCode < 300)) {
reject(new Error(`Invalid response status code: ${responseStatusCode}`));
return;
}

const responseBody = await this.session.sendCommand('Fetch.getResponseBody', {requestId});
if (responseBody.base64Encoded) {
resolve(Buffer.from(responseBody.body, 'base64').toString());
if (responseStatusCode >= 200 && responseStatusCode <= 299) {
resolve({
status: responseStatusCode,
content: await this._resolveResponseBody(requestId),
});
} else {
resolve(responseBody.body);
resolve({status: responseStatusCode, content: null});
}

// Fail the request (from the page's perspective) so that the iframe never loads.
Expand Down
23 changes: 18 additions & 5 deletions lighthouse-core/gather/gatherers/seo/robots-txt.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
'use strict';

const FRGatherer = require('../../../fraggle-rock/gather/base-gatherer.js');
const {getBrowserVersion} = require('../../driver/environment.js');

/* global fetch, location */

Expand Down Expand Up @@ -36,11 +37,23 @@ class RobotsTxt extends FRGatherer {
* @param {LH.Gatherer.FRTransitionalContext} passContext
* @return {Promise<LH.Artifacts['RobotsTxt']>}
*/
getArtifact(passContext) {
return passContext.driver.executionContext.evaluate(getRobotsTxtContent, {
args: [],
useIsolation: true,
});
async getArtifact(passContext) {
const {milestone} = await getBrowserVersion(passContext.driver.defaultSession);

// TODO: Remove when 92 hits stable.
// Iframe fetcher still has issues with CSPs.
// Only use the fetcher if we are fetching over the CDP.
if (milestone < 92) {
return passContext.driver.executionContext.evaluate(getRobotsTxtContent, {
args: [],
useIsolation: true,
});
}

const robotsUrl = new URL('/robots.txt', passContext.url).href;
await passContext.driver.fetcher.enable();
return passContext.driver.fetcher.fetchResource(robotsUrl)
.catch(() => ({status: null, content: null}));
}
}

Expand Down
8 changes: 5 additions & 3 deletions lighthouse-core/gather/gatherers/source-maps.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ class SourceMaps extends Gatherer {
* @return {Promise<LH.Artifacts.RawSourceMap>}
*/
async fetchSourceMap(driver, sourceMapUrl) {
/** @type {string} */
const sourceMapJson = await driver.fetcher.fetchResource(sourceMapUrl, {timeout: 1500});
return JSON.parse(sourceMapJson);
const response = await driver.fetcher.fetchResource(sourceMapUrl, {timeout: 1500});
if (response.content === null) {
throw new Error(`Failed fetching source map (${response.status})`);
}
return JSON.parse(response.content);
}

/**
Expand Down
8 changes: 4 additions & 4 deletions lighthouse-core/test/gather/fetcher-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,10 @@ describe('._fetchResourceOverProtocol', () => {
.mockResponse('Network.disable');

const data = await fetcher._fetchResourceOverProtocol('https://example.com', {timeout: 500});
expect(data).toEqual(streamContents);
expect(data).toEqual({content: streamContents, status: 200});
});

it('throws when resource could not be fetched', async () => {
it('returns null when resource could not be fetched', async () => {
connectionStub.sendCommand = createMockSendCommandFn()
.mockResponse('Network.enable')
.mockResponse('Page.getFrameTree', {frameTree: {frame: {id: 'FRAME'}}})
Expand All @@ -145,8 +145,8 @@ describe('._fetchResourceOverProtocol', () => {
})
.mockResponse('Network.disable');

const dataPromise = fetcher._fetchResourceOverProtocol('https://example.com', {timeout: 500});
await expect(dataPromise).rejects.toThrowError(/Loading network resource failed/);
const data = await fetcher._fetchResourceOverProtocol('https://example.com', {timeout: 500});
expect(data).toEqual({content: null, status: 404});
});

it('throws on timeout', async () => {
Expand Down
35 changes: 32 additions & 3 deletions lighthouse-core/test/gather/gatherers/source-maps-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ describe('SourceMaps gatherer', () => {
* `resolvedSourceMapUrl` is used to assert that the SourceMaps gatherer is using the expected
* url to fetch the source map.
* `fetchError` mocks an error that happens in the page. Only fetch error message make sense.
* @param {Array<{scriptParsedEvent: LH.Crdp.Debugger.ScriptParsedEvent, map: string, resolvedSourceMapUrl?: string, fetchError: string}>} mapsAndEvents
* @param {Array<{scriptParsedEvent: LH.Crdp.Debugger.ScriptParsedEvent, map: string, status?: number, resolvedSourceMapUrl?: string, fetchError: string}>} mapsAndEvents
* @return {Promise<LH.Artifacts['SourceMaps']>}
*/
async function runSourceMaps(mapsAndEvents) {
Expand All @@ -50,7 +50,14 @@ describe('SourceMaps gatherer', () => {
.mockResponse('Fetch.disable', {});
const fetchMock = jest.fn();

for (const {scriptParsedEvent, map, resolvedSourceMapUrl, fetchError} of mapsAndEvents) {
for (const mapAndEvents of mapsAndEvents) {
const {
scriptParsedEvent,
map,
status = null,
resolvedSourceMapUrl,
fetchError,
} = mapAndEvents;
onMock.mockEvent('protocolevent', {
method: 'Debugger.scriptParsed',
params: scriptParsedEvent,
Expand All @@ -71,7 +78,7 @@ describe('SourceMaps gatherer', () => {
throw new Error(fetchError);
}

return map;
return {content: map, status};
});
}
const connectionStub = new Connection();
Expand Down Expand Up @@ -173,6 +180,28 @@ describe('SourceMaps gatherer', () => {
]);
});

it('throws an error message when fetching map returns bad status code', async () => {
const mapsAndEvents = [
{
scriptParsedEvent: {
url: 'http://www.example.com/bundle.js',
sourceMapURL: 'http://www.example.com/bundle.js.map',
},
status: 404,
map: null,
},
];
const artifact = await runSourceMaps(mapsAndEvents);
expect(artifact).toEqual([
{
scriptUrl: mapsAndEvents[0].scriptParsedEvent.url,
sourceMapUrl: mapsAndEvents[0].scriptParsedEvent.sourceMapURL,
errorMessage: 'Error: Failed fetching source map (404)',
map: undefined,
},
]);
});

it('generates an error message when fetching map fails', async () => {
const mapsAndEvents = [
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ Gathering: TagsBlockingFirstPaint
Gathering: FontSize
Gathering: EmbeddedContent
Gathering: RobotsTxt
Getting browser version
Gathering: TapTargets
Gathering: Accessibility
Gathering: TraceElements
Expand Down

0 comments on commit 03adece

Please sign in to comment.