From a18b50f4ae6fe4b1122561755b48bfea9c3f6c95 Mon Sep 17 00:00:00 2001
From: sairaj mote
Date: Thu, 7 Dec 2023 01:08:54 +0530
Subject: [PATCH] adding recursive fetching and hashing

---
Review notes (git am ignores text between the "---" line and the diff):
 * fetchAndHashContent resolves linked resources with new URL(reference, base)
   and skips elements that carry no href/src.
 * index.min.js below was built from the earlier revision of index.js, and the
   blob id on the index.js "index" line predates this revision; rebuild the
   minified bundle after applying.
 * Leading whitespace in the index.js hunks was reconstructed; use
   "git apply --ignore-whitespace" if context lines fail to match.

 index.js          |  59 +++++++++++++++++++--
 index.min.js      |   2 +-
 package-lock.json | 124 ++++++++++++++++++++++++++++++++++++++++++++-
 package.json      |   3 +-
 4 files changed, 181 insertions(+), 7 deletions(-)

diff --git a/index.js b/index.js
index 8a0b5c1..f200290 100644
--- a/index.js
+++ b/index.js
@@ -4,6 +4,8 @@ const axios = require('axios');
 const { createHash } = require('crypto');
 const archiver = require('archiver');
 const rateLimit = require('express-rate-limit');
+const { parse: parseUrl } = require('url');
+const { parse: parseHtml } = require('node-html-parser');
 
 const app = express();
 const port = process.env.PORT || 3000;
@@ -42,7 +44,6 @@ app.use(
 app.get('/', (req, res) => {
     res.send('Hello There!');
 })
-
 // hashContent function to hash the content of a file
 async function hashContent(content) {
     const hash = createHash('sha256');
@@ -50,6 +51,54 @@ async function hashContent(content) {
     return hash.digest('hex');
 }
 
+// Recursively fetch a page plus the stylesheets and scripts it links to, and
+// return the page body joined with the body of every linked resource. Despite
+// the name, no digest is computed here; the caller hashes the combined string.
+//
+// url:         absolute URL of the document to fetch.
+// visitedUrls: URLs already requested during this crawl. The same Set is
+//              passed down the recursion so a URL is fetched at most once and
+//              reference cycles terminate.
+//
+// Rejects if a request fails or a reference cannot be resolved to a URL.
+// NOTE(review): linked resources are followed to whichever host they name. If
+// ALLOWED_DOMAINS is meant to restrict outbound requests, enforce it here.
+async function fetchAndHashContent(url, visitedUrls = new Set()) {
+    if (visitedUrls.has(url)) {
+        return ''; // Already fetched during this crawl
+    }
+
+    visitedUrls.add(url);
+
+    const response = await axios.get(url, { responseType: 'arraybuffer', timeout: 10000 });
+    const content = response.data.toString('utf-8');
+
+    // Collect the href/src of each linked stylesheet and script. Elements with
+    // no usable reference (e.g. a stylesheet link element without an href) are
+    // skipped; passing undefined to the URL parser would throw.
+    const references = parseHtml(content)
+        .querySelectorAll('link[rel="stylesheet"], script[src]')
+        .map((resource) => resource.getAttribute('href') || resource.getAttribute('src'))
+        .filter(Boolean);
+
+    const linkedResourceHashes = await Promise.all(references.map(async (reference) => {
+        // Resolve against the page URL the way a browser does. Appending the
+        // reference to the page URL can mishandle root-relative ("/a.css"),
+        // parent-relative ("../a.css") and protocol-relative ("//cdn/a.js")
+        // references, and page URLs that name a file.
+        const resourceUrl = new URL(reference, url);
+        resourceUrl.hash = ''; // The fragment is never sent to the server
+        const resourceContent = await fetchAndHashContent(resourceUrl.href, visitedUrls);
+        // Keep labelling each entry with the reference's own path and query so
+        // the combined string is unchanged for references that resolved before.
+        return `${parseUrl(reference).path || ''}_${resourceContent}`;
+    }));
+
+    // Combine the page content with the content of its linked resources
+    return `${content}_${linkedResourceHashes.join('_')}`;
+}
+
+
 // API endpoint to start the recursive download and hashing
 app.post('/hash', async (req, res) => {
     try {
@@ -62,15 +111,17 @@ app.post('/hash', async (req, res) => {
             url = [url];
 
         const promises = url.map(async (url) => {
-            const response = await axios.get(url, { responseType: 'arraybuffer', timeout: 10000 });
-            const fileHash = await hashContent(response.data);
+            const hashedContent = await fetchAndHashContent(url);
+            const fileHash = await hashContent(Buffer.from(hashedContent, 'utf-8'));
             return { url, fileHash };
-        })
+        });
+
         let results = await Promise.all(promises);
         results = results.reduce((acc, { url, fileHash }) => {
             acc[url] = fileHash;
             return acc;
         }, {});
+
         res.json(results);
     } catch (error) {
         console.error('Error:', error.message);
diff --git a/index.min.js b/index.min.js
index 7ae15ed..ec75f2a 100644
--- a/index.min.js
+++ b/index.min.js
@@ -1 +1 @@
-require("dotenv").config();const express=require("express"),axios=require("axios"),{createHash:createHash}=require("crypto"),archiver=require("archiver"),rateLimit=require("express-rate-limit"),app=express(),port=process.env.PORT||3e3;app.use(express.json());const allowedDomains=process.env.ALLOWED_DOMAINS.split(",");async function hashContent(content){const hash=createHash("sha256");return hash.update(content),hash.digest("hex")}async function downloadGitHubRepo(owner,repo){if(!owner||!repo)throw new Error("Missing owner or repo");const zipUrl=`https://github.com/${owner}/${repo}/archive/refs/heads/master.zip`;return(await
axios.get(zipUrl,{responseType:"arraybuffer"})).data}app.use(rateLimit({windowMs:6e4,max:1})),app.get("/",((req,res)=>{res.send("Hello There!")})),app.post("/hash",(async(req,res)=>{try{console.log("Request:",req.body);let{url:url}=req.body;if(!url)return res.status(400).json({error:"Missing URL in the request parameters"});Array.isArray(url)||(url=[url]);const promises=url.map((async url=>{const response=await axios.get(url,{responseType:"arraybuffer",timeout:1e4});return{url:url,fileHash:await hashContent(response.data)}}));let results=await Promise.all(promises);results=results.reduce(((acc,{url:url,fileHash:fileHash})=>(acc[url]=fileHash,acc)),{}),res.json(results)}catch(error){console.error("Error:",error.message),res.status(500).json({error:"Internal Server Error"})}})),app.post("/download-repos",(async(req,res)=>{try{let{urls:urls}=req.body;if(!urls)return res.status(400).json({error:"Missing urls in the request parameters"});Array.isArray(urls)||(urls=[urls]);const archive=archiver("zip");res.attachment("repos.zip");const downloadPromises=urls.map((async url=>{const[owner,name]=url.split("/").slice(-2);if(!owner||!name)return void console.error(`Invalid url format: ${url}`);const zipBuffer=await downloadGitHubRepo(owner,name);archive.append(zipBuffer,{name:`${owner}-${name}.zip`})}));await Promise.all(downloadPromises),archive.finalize(),archive.pipe(res)}catch(error){console.error("Error:",error.message),res.status(500).json({error:"Internal Server Error"})}})),app.listen(port,(()=>{console.log(`Server is running at http://localhost:${port}`)})),module.exports=app; \ No newline at end of file +require("dotenv").config();const express=require("express"),axios=require("axios"),{createHash:createHash}=require("crypto"),archiver=require("archiver"),rateLimit=require("express-rate-limit"),{parse:parseUrl}=require("url"),{parse:parseHtml}=require("node-html-parser"),app=express(),port=process.env.PORT||3e3;app.use(express.json());const 
allowedDomains=process.env.ALLOWED_DOMAINS.split(",");async function hashContent(content){const hash=createHash("sha256");return hash.update(content),hash.digest("hex")}async function fetchAndHashContent(url,visitedUrls=new Set){if(visitedUrls.has(url))return"";visitedUrls.add(url);const content=(await axios.get(url,{responseType:"arraybuffer",timeout:1e4})).data.toString("utf-8"),linkedResources=parseHtml(content).querySelectorAll('link[rel="stylesheet"], script[src]');return`${content}_${(await Promise.all(linkedResources.map((async resource=>{const resourceUrl=parseUrl(resource.getAttribute("href")||resource.getAttribute("src"),!0);let absoluteResourceUrl=resourceUrl.href;resourceUrl.hostname||(resourceUrl.path.startsWith("/")||url.endsWith("/")||(url+="/"),absoluteResourceUrl=`${url}${resourceUrl.path}`);const resourceContent=await fetchAndHashContent(absoluteResourceUrl,visitedUrls);return`${resourceUrl.path}_${resourceContent}`})))).join("_")}`}async function downloadGitHubRepo(owner,repo){if(!owner||!repo)throw new Error("Missing owner or repo");const zipUrl=`https://github.com/${owner}/${repo}/archive/refs/heads/master.zip`;return(await axios.get(zipUrl,{responseType:"arraybuffer"})).data}app.use(rateLimit({windowMs:6e4,max:1})),app.get("/",((req,res)=>{res.send("Hello There!")})),app.post("/hash",(async(req,res)=>{try{console.log("Request:",req.body);let{url:url}=req.body;if(!url)return res.status(400).json({error:"Missing URL in the request parameters"});Array.isArray(url)||(url=[url]);const promises=url.map((async url=>{const hashedContent=await fetchAndHashContent(url);return{url:url,fileHash:await hashContent(Buffer.from(hashedContent,"utf-8"))}}));let results=await Promise.all(promises);results=results.reduce(((acc,{url:url,fileHash:fileHash})=>(acc[url]=fileHash,acc)),{}),res.json(results)}catch(error){console.error("Error:",error.message),res.status(500).json({error:"Internal Server 
Error"})}})),app.post("/download-repos",(async(req,res)=>{try{let{urls:urls}=req.body;if(!urls)return res.status(400).json({error:"Missing urls in the request parameters"});Array.isArray(urls)||(urls=[urls]);const archive=archiver("zip");res.attachment("repos.zip");const downloadPromises=urls.map((async url=>{const[owner,name]=url.split("/").slice(-2);if(!owner||!name)return void console.error(`Invalid url format: ${url}`);const zipBuffer=await downloadGitHubRepo(owner,name);archive.append(zipBuffer,{name:`${owner}-${name}.zip`})}));await Promise.all(downloadPromises),archive.finalize(),archive.pipe(res)}catch(error){console.error("Error:",error.message),res.status(500).json({error:"Internal Server Error"})}})),app.listen(port,(()=>{console.log(`Server is running at http://localhost:${port}`)})),module.exports=app; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index f40d2de..4fb59ab 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,8 @@ "axios": "^1.6.2", "crypto": "^1.0.1", "express": "^4.18.2", - "express-rate-limit": "^7.1.5" + "express-rate-limit": "^7.1.5", + "node-html-parser": "^6.1.11" }, "devDependencies": { "dotenv": "^16.3.1", @@ -151,6 +152,11 @@ "npm": "1.2.8000 || >= 1.4.16" } }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" + }, "node_modules/brace-expansion": { "version": "1.1.11", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", @@ -326,6 +332,32 @@ "integrity": "sha512-VxBKmeNcqQdiUQUW2Tzq0t377b54N2bMtXO/qiLa+6eRRmmC4qT3D4OnTGoT/U6O9aklQ/jTwbOtRMTTY8G0Ig==", "deprecated": "This package is no longer supported. It's now a built-in Node module. If you've depended on crypto, you should switch to the one that's built-in." 
}, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/debug": { "version": "2.6.9", "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", @@ -372,6 +404,57 @@ "npm": "1.2.8000 || >= 1.4.16" } }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": 
"sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz", + "integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, "node_modules/dotenv": { "version": "16.3.1", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.3.1.tgz", @@ -397,6 +480,17 @@ "node": ">= 0.8" } }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/escape-html": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", @@ -706,6 +800,14 @@ "node": ">= 0.4" } }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "bin": { + "he": "bin/he" + } + }, "node_modules/http-errors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", @@ -938,6 +1040,15 @@ "node": ">= 0.6" } }, + "node_modules/node-html-parser": { + "version": "6.1.11", + "resolved": "https://registry.npmjs.org/node-html-parser/-/node-html-parser-6.1.11.tgz", + "integrity": 
"sha512-FAgwwZ6h0DSDWxfD0Iq1tsDcBCxdJB1nXpLPPxX8YyVWzbfCjKWEzaynF4gZZ/8hziUmp7ZSaKylcn0iKhufUQ==", + "dependencies": { + "css-select": "^5.1.0", + "he": "1.2.0" + } + }, "node_modules/nodemon": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/nodemon/-/nodemon-3.0.2.tgz", @@ -1012,6 +1123,17 @@ "node": ">=0.10.0" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/object-inspect": { "version": "1.13.1", "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz", diff --git a/package.json b/package.json index c2baa8b..058ccb3 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,8 @@ "axios": "^1.6.2", "crypto": "^1.0.1", "express": "^4.18.2", - "express-rate-limit": "^7.1.5" + "express-rate-limit": "^7.1.5", + "node-html-parser": "^6.1.11" }, "devDependencies": { "dotenv": "^16.3.1",