diff --git a/Dockerfile b/Dockerfile index 5e096937..2d571ca6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,6 @@ RUN mkdir /app WORKDIR /app COPY . /app RUN apt-get -qq update -RUN apt-get install -yqq python3-pip -RUN pip3 install -r requirements.txt +RUN apt-get install -yqq theharvester RUN chmod +x *.py ENTRYPOINT ["/app/theHarvester.py"] diff --git a/Pipfile.lock b/Pipfile.lock index aa30e054..0f0d6914 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -301,10 +301,10 @@ }, "soupsieve": { "hashes": [ - "sha256:605f89ad5fdbfefe30cdc293303665eff2d188865d4dbe4eb510bba1edfbfce3", - "sha256:b91d676b330a0ebd5b21719cb6e9b57c57d433671f65b9c28dd3461d9a1ed0b6" + "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", + "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda" ], - "version": "==1.9.4" + "version": "==1.9.5" }, "texttable": { "hashes": [ diff --git a/theHarvester/discovery/googlefile.py b/theHarvester/discovery/googlefile.py index 26a0779f..0a18979f 100644 --- a/theHarvester/discovery/googlefile.py +++ b/theHarvester/discovery/googlefile.py @@ -12,6 +12,7 @@ def __init__(self, word): self.totalresults = "" self.server = 'www.google.com' self.start = 0 + self.links = set() def do_search(self): filetype = ['doc', 'docx', 'pdf', 'ppt', 'pptx', 'txt', 'xls', 'xlsx'] @@ -27,19 +28,22 @@ def do_search(self): page = requests.get(url, headers=headers) tree = html.fromstring(page.content) self.results = tree.xpath('//*[@class="r"]/a/@href') - + #print('results: ', self.results) for link in self.results: match = re.search(regex, link) if match: - self.totalresults += match.group('urls') + # add() keeps the URL whole; set(list(url)) would split it into single characters + self.links.add(match.group('urls')) + # print(match.group('urls')) else: - self.totalresults += f'{link}' - + # print('not matched') + # print(f'{link}') + self.links.add(link) if self.results: self.start += 100 - def get_links(self): - return self.totalresults + def get_links(self) -> set: + return self.links def process(self): self.do_search()