// index.js (5.8 KB)
'use strict';
// https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
// MIME type that `normalizeDataURL` leaves out of its output when no other media-type part remains.
const DATA_URL_DEFAULT_MIME_TYPE = 'text/plain';
// `charset` value (compared after lowercasing) that `normalizeDataURL` drops from the attribute list.
const DATA_URL_DEFAULT_CHARSET = 'us-ascii';
  5. const testParameter = (name, filters) => {
  6. return filters.some(filter => filter instanceof RegExp ? filter.test(name) : filter === name);
  7. };
  8. const normalizeDataURL = (urlString, {stripHash}) => {
  9. const match = /^data:(?<type>[^,]*?),(?<data>[^#]*?)(?:#(?<hash>.*))?$/.exec(urlString);
  10. if (!match) {
  11. throw new Error(`Invalid URL: ${urlString}`);
  12. }
  13. let {type, data, hash} = match.groups;
  14. const mediaType = type.split(';');
  15. hash = stripHash ? '' : hash;
  16. let isBase64 = false;
  17. if (mediaType[mediaType.length - 1] === 'base64') {
  18. mediaType.pop();
  19. isBase64 = true;
  20. }
  21. // Lowercase MIME type
  22. const mimeType = (mediaType.shift() || '').toLowerCase();
  23. const attributes = mediaType
  24. .map(attribute => {
  25. let [key, value = ''] = attribute.split('=').map(string => string.trim());
  26. // Lowercase `charset`
  27. if (key === 'charset') {
  28. value = value.toLowerCase();
  29. if (value === DATA_URL_DEFAULT_CHARSET) {
  30. return '';
  31. }
  32. }
  33. return `${key}${value ? `=${value}` : ''}`;
  34. })
  35. .filter(Boolean);
  36. const normalizedMediaType = [
  37. ...attributes
  38. ];
  39. if (isBase64) {
  40. normalizedMediaType.push('base64');
  41. }
  42. if (normalizedMediaType.length !== 0 || (mimeType && mimeType !== DATA_URL_DEFAULT_MIME_TYPE)) {
  43. normalizedMediaType.unshift(mimeType);
  44. }
  45. return `data:${normalizedMediaType.join(';')},${isBase64 ? data.trim() : data}${hash ? `#${hash}` : ''}`;
  46. };
  47. const normalizeUrl = (urlString, options) => {
  48. options = {
  49. defaultProtocol: 'http:',
  50. normalizeProtocol: true,
  51. forceHttp: false,
  52. forceHttps: false,
  53. stripAuthentication: true,
  54. stripHash: false,
  55. stripTextFragment: true,
  56. stripWWW: true,
  57. removeQueryParameters: [/^utm_\w+/i],
  58. removeTrailingSlash: true,
  59. removeSingleSlash: true,
  60. removeDirectoryIndex: false,
  61. sortQueryParameters: true,
  62. ...options
  63. };
  64. urlString = urlString.trim();
  65. // Data URL
  66. if (/^data:/i.test(urlString)) {
  67. return normalizeDataURL(urlString, options);
  68. }
  69. if (/^view-source:/i.test(urlString)) {
  70. throw new Error('`view-source:` is not supported as it is a non-standard protocol');
  71. }
  72. const hasRelativeProtocol = urlString.startsWith('//');
  73. const isRelativeUrl = !hasRelativeProtocol && /^\.*\//.test(urlString);
  74. // Prepend protocol
  75. if (!isRelativeUrl) {
  76. urlString = urlString.replace(/^(?!(?:\w+:)?\/\/)|^\/\//, options.defaultProtocol);
  77. }
  78. const urlObj = new URL(urlString);
  79. if (options.forceHttp && options.forceHttps) {
  80. throw new Error('The `forceHttp` and `forceHttps` options cannot be used together');
  81. }
  82. if (options.forceHttp && urlObj.protocol === 'https:') {
  83. urlObj.protocol = 'http:';
  84. }
  85. if (options.forceHttps && urlObj.protocol === 'http:') {
  86. urlObj.protocol = 'https:';
  87. }
  88. // Remove auth
  89. if (options.stripAuthentication) {
  90. urlObj.username = '';
  91. urlObj.password = '';
  92. }
  93. // Remove hash
  94. if (options.stripHash) {
  95. urlObj.hash = '';
  96. } else if (options.stripTextFragment) {
  97. urlObj.hash = urlObj.hash.replace(/#?:~:text.*?$/i, '');
  98. }
  99. // Remove duplicate slashes if not preceded by a protocol
  100. if (urlObj.pathname) {
  101. urlObj.pathname = urlObj.pathname.replace(/(?<!\b(?:[a-z][a-z\d+\-.]{1,50}:))\/{2,}/g, '/');
  102. }
  103. // Decode URI octets
  104. if (urlObj.pathname) {
  105. try {
  106. urlObj.pathname = decodeURI(urlObj.pathname);
  107. } catch (_) {}
  108. }
  109. // Remove directory index
  110. if (options.removeDirectoryIndex === true) {
  111. options.removeDirectoryIndex = [/^index\.[a-z]+$/];
  112. }
  113. if (Array.isArray(options.removeDirectoryIndex) && options.removeDirectoryIndex.length > 0) {
  114. let pathComponents = urlObj.pathname.split('/');
  115. const lastComponent = pathComponents[pathComponents.length - 1];
  116. if (testParameter(lastComponent, options.removeDirectoryIndex)) {
  117. pathComponents = pathComponents.slice(0, pathComponents.length - 1);
  118. urlObj.pathname = pathComponents.slice(1).join('/') + '/';
  119. }
  120. }
  121. if (urlObj.hostname) {
  122. // Remove trailing dot
  123. urlObj.hostname = urlObj.hostname.replace(/\.$/, '');
  124. // Remove `www.`
  125. if (options.stripWWW && /^www\.(?!www\.)(?:[a-z\-\d]{1,63})\.(?:[a-z.\-\d]{2,63})$/.test(urlObj.hostname)) {
  126. // Each label should be max 63 at length (min: 1).
  127. // Source: https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_host_names
  128. // Each TLD should be up to 63 characters long (min: 2).
  129. // It is technically possible to have a single character TLD, but none currently exist.
  130. urlObj.hostname = urlObj.hostname.replace(/^www\./, '');
  131. }
  132. }
  133. // Remove query unwanted parameters
  134. if (Array.isArray(options.removeQueryParameters)) {
  135. for (const key of [...urlObj.searchParams.keys()]) {
  136. if (testParameter(key, options.removeQueryParameters)) {
  137. urlObj.searchParams.delete(key);
  138. }
  139. }
  140. }
  141. if (options.removeQueryParameters === true) {
  142. urlObj.search = '';
  143. }
  144. // Sort query parameters
  145. if (options.sortQueryParameters) {
  146. urlObj.searchParams.sort();
  147. }
  148. if (options.removeTrailingSlash) {
  149. urlObj.pathname = urlObj.pathname.replace(/\/$/, '');
  150. }
  151. const oldUrlString = urlString;
  152. // Take advantage of many of the Node `url` normalizations
  153. urlString = urlObj.toString();
  154. if (!options.removeSingleSlash && urlObj.pathname === '/' && !oldUrlString.endsWith('/') && urlObj.hash === '') {
  155. urlString = urlString.replace(/\/$/, '');
  156. }
  157. // Remove ending `/` unless removeSingleSlash is false
  158. if ((options.removeTrailingSlash || urlObj.pathname === '/') && urlObj.hash === '' && options.removeSingleSlash) {
  159. urlString = urlString.replace(/\/$/, '');
  160. }
  161. // Restore relative protocol, if applicable
  162. if (hasRelativeProtocol && !options.normalizeProtocol) {
  163. urlString = urlString.replace(/^http:\/\//, '//');
  164. }
  165. // Remove http/https
  166. if (options.stripProtocol) {
  167. urlString = urlString.replace(/^(?:https?:)?\/\//, '');
  168. }
  169. return urlString;
  170. };
// CommonJS export: `normalizeUrl` is the value assigned to `module.exports`.
module.exports = normalizeUrl;