├── lastpymile ├── __init__.py ├── utils.py ├── gitrepository.py ├── pypackage.py ├── abstractpackageanalysis.py └── maliciouscodepackageanalyzer.py ├── test ├── test_scanning_a_package.py └── test_scanning_an_artifact.py ├── .gitignore ├── requirements.txt ├── pyproject.toml ├── lastpymile.py ├── README.md ├── LICENSE └── poetry.lock /lastpymile/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/test_scanning_a_package.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scanning a package on PyPI and generates a list of phanton files and lines if any 3 | """ 4 | -------------------------------------------------------------------------------- /test/test_scanning_an_artifact.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scanning an artifact (a specific release) and report any phantom files or lines 3 | """ 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv/ 2 | .vscode/ 3 | tmp/ 4 | __pycache__/ 5 | /lastpymile_dev*.py 6 | 7 | 8 | 9 | # Repository wide ignore mac DS_Store files 10 | .DS_Store -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2021.10.8 2 | charset-normalizer==2.0.8 3 | coloredlogs==15.0.1 4 | gitdb==4.0.9 5 | GitPython==3.1.24 6 | humanfriendly==10.0 7 | idna==3.3 8 | lxml==4.6.4 9 | pyreadline3==3.3 10 | requests==2.26.0 11 | smmap==5.0.0 12 | typing-extensions==4.0.1 13 | urllib3==1.26.7 14 | -------------------------------------------------------------------------------- /pyproject.toml: 
from argparse import ArgumentParser, ArgumentTypeError
import logging, coloredlogs
import os, pathlib
import json

from lastpymile.utils import Utils
from lastpymile.maliciouscodepackageanalyzer import MaliciousCodePackageAnalyzer


class LastPyMileApplication():
    """Command line application that analyzes a PyPI package for malicious code."""

    # Mapping from the user supplied 0-5 verbosity value to logging levels.
    # 0 maps to 100 (above CRITICAL) so that nothing is ever logged.
    __LOG_LEVELS = {
        0: 100,
        1: logging.CRITICAL,
        2: logging.ERROR,
        3: logging.WARNING,
        4: logging.INFO,
        5: logging.DEBUG,
    }

    @staticmethod
    def __packageType(package):
        """Argparse type-checker for the positional 'package' argument.

        Accepts 'name' or 'name:version'; anything containing more than one
        ':' separator is rejected.
        """
        # BUGFIX: error message typo "Invlaid" corrected.
        if len(package.split(":")) > 2:
            raise ArgumentTypeError("Invalid package name ")
        return package

    @staticmethod
    def __logLevelType(x):
        """Argparse type-checker converting a 0-5 verbosity value to a logging level."""
        x = int(x)
        try:
            # Table lookup replaces the original if/elif ladder.
            return LastPyMileApplication.__LOG_LEVELS[x]
        except KeyError:
            raise ArgumentTypeError("Log level must be between 0 and 5") from None
| def __init__(self): 37 | parser = ArgumentParser() 38 | 39 | parser.add_argument( 40 | 'package', 41 | type=str, 42 | help='Package name can be in the form :. If no version is specified the latest version is retrieved.' 43 | ) 44 | parser.add_argument( 45 | '-lv', '--loglevel', 46 | type=LastPyMileApplication.__logLevelType, 47 | default=logging.INFO, 48 | help='Log level. From 0(no log) to 5(debug). default(3)', 49 | ) 50 | parser.add_argument( 51 | '-f', '--reportfile', 52 | type=str, 53 | default=None, 54 | help='Write the report to the specified file', 55 | ) 56 | parser.add_argument( 57 | '-o', 58 | action='store_true', 59 | help='Print the report to the screen', 60 | ) 61 | 62 | 63 | args = parser.parse_args() 64 | 65 | l=logging.getLogger("lastpymile") 66 | coloredlogs.install(logger=l,level=args.loglevel) 67 | 68 | 69 | rl=logging.getLogger("lastpymile_report") 70 | rl.setLevel(logging.DEBUG) 71 | 72 | 73 | ch = logging.StreamHandler() 74 | ch.setLevel(logging.DEBUG) 75 | 76 | ch.setFormatter(CustomFormatter()) 77 | rl.addHandler(ch) 78 | 79 | try: 80 | 81 | pakage=args.package.split(":") 82 | pakage_name=pakage[0] 83 | pakage_version=pakage[1] if len(pakage)==2 else None 84 | 85 | 86 | current_folder=pathlib.Path().resolve() 87 | tmp_folder=os.path.join(current_folder,"tmp") 88 | if not os.path.exists(tmp_folder): 89 | os.makedirs(tmp_folder) 90 | package_analysis = MaliciousCodePackageAnalyzer.createAnaliysisForPackage(pakage_name,pakage_version, checked=True) 91 | if package_analysis is not None: 92 | analysis_report=package_analysis.startAnalysis() 93 | json_report=json.dumps(analysis_report,indent=3) 94 | if args.reportfile is not None: 95 | with open(args.reportfile, "w") as f: 96 | f.write(json_report) 97 | if args.reportfile is None or args.o is True: 98 | print(json_report) 99 | 100 | except Exception as e: 101 | import traceback 102 | l.critical("Exception in main code: {}\n{}".format(e,traceback.format_exc())) 103 | 104 | 105 | class 
CustomFormatter(logging.Formatter): 106 | 107 | white= "\u001b[37m" 108 | grey = "\x1b[38;21m" 109 | green = "\u001b[32m" 110 | orange = "\u001b[35m" 111 | yellow = "\x1b[33;21m" 112 | red = "\x1b[31;21m" 113 | bold_red = "\x1b[31;1m" 114 | reset = "\x1b[0m" 115 | format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)" 116 | 117 | format = "Lastymile Report: %(message)s" 118 | 119 | FORMATS = { 120 | logging.DEBUG: white + format + reset, 121 | logging.INFO: green + format + reset, 122 | logging.WARNING: yellow + format + reset, 123 | logging.ERROR: red + format + reset, 124 | logging.CRITICAL: bold_red + format + reset 125 | } 126 | 127 | def format(self, record): 128 | log_fmt = self.FORMATS.get(record.levelno) 129 | formatter = logging.Formatter(log_fmt) 130 | return formatter.format(record) 131 | 132 | if __name__ == "__main__": 133 | LastPyMileApplication() 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /lastpymile/utils.py: -------------------------------------------------------------------------------- 1 | import os, stat 2 | from pathlib import Path 3 | import requests, urllib 4 | import re 5 | 6 | class Utils(): 7 | """ 8 | Utility class with static functions 9 | """ 10 | 11 | @staticmethod 12 | def sanitizeFolderName(folder_name, max_length=None): 13 | """ 14 | Sanitize a string to be used as a folder name, and optionally truncate its' lenght to the specified value. 
class Utils():
    """
    Utility class with static functions
    """

    @staticmethod
    def sanitizeFolderName(folder_name, max_length=None):
        """
        Sanitize a string to be used as a folder name, optionally truncating it to the specified length.
        All characters except letters, numbers, dot, underscore and dash are removed,
        and spaces are replaced with underscores.

        Parameters:
            folder_name(str): The string to be sanitized
            max_length(int): The maximum length of the returned sanitized name
        """
        sanitized = re.sub(r'[^A-Za-z\d\._-]', '', folder_name.replace(" ", "_"))
        if max_length is not None and len(sanitized) > max_length:
            sanitized = sanitized[:max_length]
        return sanitized

    @staticmethod
    def ensureFilePath(file_path:str) -> None:
        """
        Ensure the existence of a file's parent folder. Furthermore, if the file already exists it is deleted.

        Parameters:
            file_path(str): The path of the file to ensure
        """
        if os.path.exists(file_path):
            os.remove(file_path)
        else:
            Utils.ensureFolderPath(Path(file_path).parent)

    @staticmethod
    def ensureFolderPath(folder_path):
        """
        Ensure the existence of a folder path, creating it (and any parents) if missing.

        Parameters:
            folder_path(str): The path of the folder to ensure
        """
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

    @staticmethod
    def rmtree(folder_path):
        """
        Custom reimplementation of shutil.rmtree that works under windows.
        (shutil.rmtree raises exceptions under windows if permissions are not right)

        Parameters:
            folder_path(str): The path of the folder to remove
        """
        for root, dirs, files in os.walk(folder_path, topdown=False):
            for name in files:
                filename = os.path.join(root, name)
                # Make the file writable first so os.remove cannot fail on read-only files.
                os.chmod(filename, stat.S_IWUSR)
                os.remove(filename)
            for name in dirs:
                os.rmdir(os.path.join(root, name))
        os.rmdir(folder_path)

    @staticmethod
    def __isUrlAvailable(url:str) -> bool:
        """
        Perform an HTTP HEAD request to test if a url is available.

        Parameters:
            url(str): The url to test

        Return (bool):
            True if the url is available, False otherwise
        """
        import requests  # local import: keeps the rest of this module usable without network deps
        resp = requests.head(url)
        return 200 <= resp.status_code < 300

    @staticmethod
    def getUrlContent(url:str, cheked:bool=False) -> bytes:
        """
        Retrieve the content of the specified url.

        Parameters:
            url(str): The url to download
            cheked(bool): If True no exception is raised and None is returned on failure
                          (parameter name typo kept for backward compatibility)

        Return (bytes):
            The url content, or None when 'cheked' is True and the download failed
        """
        import requests
        if not Utils.__isUrlAvailable(url):
            if cheked == True:
                # BUGFIX: was 'return False', inconsistent with the documented
                # behavior (and with the 'return None' in the except branch below).
                return None
            raise Exception("Url {} not available".format(url))
        try:
            response = requests.get(url)
            if 200 <= response.status_code < 300:
                return response.content
            else:
                raise Exception("Url '{}' response code {}".format(url, response.status_code))
        except Exception as e:
            if cheked == True:
                return None
            raise e

    @staticmethod
    def downloadUrl(url:str, dest_file:str, cheked:bool=False) -> bool:
        """
        Download a file to the specified location.

        Parameters:
            url(str): The url to download
            dest_file(str): The path where to save the file
            cheked(bool): If True no exception is raised and False is returned

        Return (bool):
            True if the url has been successfully downloaded, False otherwise
        """
        # BUGFIX: 'import urllib' alone does not make urllib.request available;
        # import the submodule explicitly before using urlretrieve.
        import urllib.request
        if not Utils.__isUrlAvailable(url):
            if cheked == True:
                return False
            raise Exception("Url {} not available".format(url))
        Utils.ensureFilePath(dest_file)
        try:
            urllib.request.urlretrieve(url, dest_file)
            return True
        except Exception as e:
            # Remove any partially downloaded file before reporting failure.
            if os.path.exists(dest_file):
                os.remove(dest_file)
            if cheked == True:
                return False
            raise e
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import os 3 | from typing import Callable 4 | from git import Repo 5 | import git 6 | 7 | class GitRepository: 8 | """ 9 | Useful class that wrap the git.Repo class 10 | """ 11 | 12 | @staticmethod 13 | def cloneFromUrl(repository_url:str, clone_path:str) -> GitRepository: 14 | """ 15 | Static method to create a GitRepository object, cloning a remote repository 16 | Parameters: 17 | repository_url (str): the url of the git repository to clone 18 | clone_path (str): a disk path where the repository will be cloned. 19 | 20 | Return (GitRepository): 21 | A GitRepository object to manage the repository 22 | 23 | Raise (GitException): If the repository cannto be cloned 24 | """ 25 | try: 26 | repo=Repo.clone_from(repository_url, clone_path) 27 | return GitRepository(repo,clone_path,repository_url) 28 | except Exception as e: 29 | raise GitException("Error encountered while cloning repository from {}".format(repository_url)) from e 30 | 31 | @staticmethod 32 | def loadFromPath(repository_path:str) -> GitRepository: 33 | """ 34 | Static method to create a GitRepository object, loading the repository from a local folder 35 | Parameters: 36 | repository_path (str): a disk path where the repository is located. 
37 | 38 | Return (GitRepository): 39 | A GitRepository object to manage the repository 40 | 41 | Raise (GitException): If the repository cannto be loaded 42 | """ 43 | try: 44 | repo=Repo(path=repository_path) 45 | return GitRepository(repo,repository_path) 46 | except Exception as e: 47 | raise GitException("Error encountered while loadin repository from {}".format(repository_path)) from e 48 | 49 | 50 | def __init__(self, repository:Repo, repository_folder:str, repository_url:str=None): 51 | self.repo=repository 52 | self.repository_folder=repository_folder 53 | self.repository_url=repository_url 54 | 55 | def getRepositoryUrl(self) -> str: 56 | """ 57 | Return the remote repository url if the repository was cloned from an url 58 | 59 | Return (str): 60 | the remote repository url or None if the repository was loaded from a local disk folder 61 | """ 62 | return self.repository_url 63 | 64 | def getRepositoryFolder(self) -> str: 65 | """ 66 | Return the disk path location where this repository is located 67 | 68 | Return (str): 69 | the disk path location where this repository is located 70 | """ 71 | return self.repository_folder 72 | 73 | def getCommitsList(self) -> list[str]: 74 | """ 75 | Return a list of all commit's hashes present in the repository 76 | 77 | Return (str): 78 | a list of all commit's hashes present in the repository 79 | """ 80 | return list(self.repo.git.rev_list('--all','--remotes').split("\n")) 81 | 82 | def checkoutCommit(self, commit_hash:str) -> git.objects.commit.Commit: 83 | """ 84 | Checkout the specified commit 85 | 86 | Return (git.objects.commit.Commit): 87 | a git.objects.commit.Commit Object 88 | """ 89 | if self.repo.head.object.hexsha!=commit_hash: 90 | self.repo.git.checkout(commit_hash) 91 | return self.repo.head.object 92 | 93 | def getCommitEntryContent(self,commit_hash:str,file_path:str) -> bytes: 94 | """ 95 | Get the content of a file in the specified commit. 
96 | 97 | Return (bytes): 98 | the file content of the specified file 99 | """ 100 | self.checkoutCommit(commit_hash) 101 | with open(os.path.join(self.repository_folder,file_path), 'rb') as f: 102 | return f.read() 103 | ## 104 | ## Important!! DO NOT USE self.repo.git.show since it use the STD_OUT to capture the content of the file and can alter the real file content (remove empty lines/has bad encoing) 105 | ## 106 | # return self.repo.git.show('{}:{}'.format(commit_hash, file_path)) 107 | 108 | def getFilesAtCommit(self, commit:git.objects.commit.Commit, filter:Callable[[str], bool]=None) -> list[str]: 109 | """ 110 | Return the list of all files at the specified commit 111 | Patameters: 112 | commit(git.objects.commit.Commit): a commit object 113 | filter(Callable[[str], bool]=None)): an optional filter function to filter the result. 114 | The function has a str parameter with the file path (relative to the repository) and must return a bool, 115 | where True will add the file to the result and False will exclude it. 116 | Return (bytes): 117 | the file content of the specified file 118 | """ 119 | commit_files=[] 120 | for element in commit.tree.traverse(): 121 | if filter is None or filter(element.path)==True: 122 | commit_files.append(element.path) 123 | return commit_files 124 | 125 | class GitException(Exception): 126 | def __init__(self,message): 127 | super().__init__(message) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lastpymile 2 | # `LastPyMile`: Identify the differences between build artifacts of PyPI packages and the respective source code repository 3 | The paper has been published in the proceeding of [ESEC/FSE 2021](https://dl.acm.org/doi/10.1145/3468264.3468592). 
4 | 5 | Figure below in an overview of the LastPyMile workflow and internal components: 6 | 7 | 8 | `LastPyMile` extends the current package scanning techniques for malware injections. 9 | The tool analyzes a package from the [PyPI](https://pypi.org) repository by: 10 | 1. Identifying the discrepancy (files and lines) between the source code and the package's artifact 11 | 2. Scanning the discrepancy using Yara rules ([MalwareCheck patterns](https://github.com/pypa/warehouse/blob/main/warehouse/malware/checks/setup_patterns/check.py)) and AST code analysis ([Bandit4mal patterns](https://github.com/lyvd/bandit4mal)). 12 | 13 | As such, `LastPyMile` aims to detect malicious packages in the package owner hijacking, typosquatting/combosquatting attacks (See [Ohm et al.](https://link.springer.com/chapter/10.1007/978-3-030-52683-2_2), [Vu et al.](https://ieeexplore.ieee.org/abstract/document/9229803)). In these attacks, malicious code is injected into a package's artifact, which does not exist in the source code repository. 14 | 15 | In comparison to [the existing scanning tools employed by PyPI](https://warehouse.readthedocs.io/development/malware-checks.html#malware-checks) 16 | LastPyMile reduces the number of alerts produced by a malware checking tool to a number that a human can check. Also, it 17 | removes all the alerts from benign packages, and therefore, allows a clear distinction between benign and malicious 18 | packages. 19 | 20 | ## History 21 | `LastPyMile` is originally developed by [SAP Security Research](https://www.sap.com/documents/2017/12/cc047065-e67c-0010-82c7-eda71af511fa.html) 22 | and [Security Group at the University of Trento](https://securitylab.disi.unitn.it/doku.php?id=start). 
23 | 24 | The tool is best described in the following scientific papers, please cite these if you use the tool for your research work: 25 | - [Duc-Ly Vu](https://scholar.google.com/citations?hl=en&user=sl1ofC0AAAAJ), [Ivan Pashchenko](https://scholar.google.com/citations?user=Zy55O-YAAAAJ&hl=en), 26 | [Fabio Massacci](https://scholar.google.com/citations?user=gC_ZVPgAAAAJ&hl=en), [Henrik Plate](https://scholar.google.com/citations?user=Kaleo5YAAAAJ&hl=en), [Antonino Sabetta](https://scholar.google.com/citations?hl=en&user=BhcceV8AAAAJ), [**Towards Using Source Code Repositories to Identify Software Supply Chain Attacks**](https://dl.acm.org/doi/abs/10.1145/3372297.3420015), ACM CCS 2020. 27 | - [Duc-Ly Vu](https://scholar.google.com/citations?hl=en&user=sl1ofC0AAAAJ), [Ivan Pashchenko](https://scholar.google.com/citations?user=Zy55O-YAAAAJ&hl=en), 28 | [Fabio Massacci](https://scholar.google.com/citations?user=gC_ZVPgAAAAJ&hl=en), [Henrik Plate](https://scholar.google.com/citations?user=Kaleo5YAAAAJ&hl=en), [Antonino Sabetta](https://scholar.google.com/citations?hl=en&user=BhcceV8AAAAJ), [**LastPyMile: identifying the discrepancy between sources and packages**](), ESEC/FSE 2021. 29 | 30 | ## Features 31 | - Identify the Github URL of a PyPI package 32 | - Identify the differences between build artifacts of software packages and the respective source code repository 33 | - Scan the differences using Yara rules and [bandit4mal](https://github.com/lyvd/bandit4mal) 34 | - Process a repository and artifact in parallel 35 | 36 | ## Installation 37 | *Requires python 3.9* 38 | - At the root directory, run: ```poetry install``` to install package 39 | dependencies. This will also install [pytest](https://docs.pytest.org/en/6.2.x/) for testing the project. 40 | - At the root directory, run: ```poetry shell``` to active the environment 41 | - 42 | ## Integrate `bandit4mal` into `LastPyMile` 43 | `bandit4mal` is built using Python2 to scan both Python2 and Python3 code. 
So, **please use python2 without any virutal environment when installing bandit4mal**. We use `bandit4mal` to scan the discrepancy and report the alerts associated with the discrepancy. `bandit4mal` requires [pbr>=2.0.0](https://pypi.org/project/pbr/) 44 | - Go to [tools](tools/), run ```git clone https://github.com/lyvd/bandit4mal``` 45 | - Install `bandit4mal` by running this command ```sudo python2 setup.py install``` 46 | - The bandit program will be installed at the path ```/usr/local/bin/bandit``` (MacOS and Ubuntu) 47 | ## Usage 48 | 49 | To list all available options: 50 | ```bash 51 | python lastpymile.py -h 52 | ``` 53 | 54 | To scan a pacakge 55 | ```bash 56 | python lastpymile.py [:] 57 | ``` 58 | 59 | ## Limitations 60 | - Binary distributions (e.g., .exe, .dmg) are not supported 61 | - Packages that are not hosted on Github are not supported yet. 62 | 63 | ## Known Issues 64 | 65 | 66 | ## Todo (upcoming changes) 67 | - Improve the techniques for finding Github URLs of a PyPI package. We are working to integrate [py2src](https://github.com/simonepirocca/py2src) into LastPyMile. 68 | - Update the API documentation in the [docs](docs) directory 69 | 70 | 71 | ### How to obtain support 72 | Contact me at [ducly.vu@unint.it](mailto:ducly.vu@unint.it) or Twitter [@vuly16](https://twitter.com/vuly16) 73 | 74 | ### Contributing 75 | Open a Pull request at the repository in the [AssureMoss LastPyMile](https://github.com/assuremoss/lastpymile) 76 | 77 | ### Acknowledgement 78 | This work is partly funded by the EU under the H2020 research project 79 | [SPARTA](https://sparta.eu/) (Grant No.830892), 80 | [AssureMOSS](https://assuremoss.eu/) (Grant No.952647) and 81 | [CyberSec4Europe](https://cybersec4europe.eu/) (Grant No.830929). 
82 | 83 | -------------------------------------------------------------------------------- /lastpymile/pypackage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import logging 3 | import os,urllib 4 | import requests 5 | import json 6 | from urllib.parse import quote 7 | from lxml import html 8 | 9 | from lastpymile.utils import Utils 10 | 11 | class PyPackage: 12 | """ 13 | Class that represent a python package from pypi.org 14 | """ 15 | 16 | # __RELEASE_TYPE_WHEEL="wheel" 17 | # __RELEASE_TYPE_SOURCE="source" 18 | # __RELEASE_TYPE_EGG="egg" 19 | # __RELEASE_TYPE_UNKNOWN="unknown" 20 | 21 | __PYPI_URL="https://pypi.org" 22 | 23 | __logger=logging.getLogger("lastpymile.PyPackage") 24 | 25 | @staticmethod 26 | def getAllPackagesList() -> list[str]: 27 | """ 28 | Static method to retrieve all available packages from pypi.org 29 | 30 | Return (list[str]): 31 | A list of available packages names on pypi.org 32 | """ 33 | response = requests.get(PyPackage.__PYPI_URL+"/simple") 34 | tree = html.fromstring(response.content) 35 | package_list = [package for package in tree.xpath('//a/text()')] 36 | return package_list 37 | 38 | @staticmethod 39 | def searchPackage(package_name:str, package_version:str=None, checked:bool=False) -> PyPackage: 40 | """ 41 | Static method to create a PyPackage from its name and an optional version 42 | 43 | Parameters: 44 | package_name(str): The name of the package 45 | package_version(str): The version of the package. May be None, in that case the latest version is retrieved 46 | checked(bool): If True no exceptions are rasied if the pacakge cannot be found and None is returned. 
Default is False 47 | 48 | Return (PyPackage): 49 | The PyPackage object 50 | 51 | Raise (PyPackageNotFoundException): If the package couldn't be found 52 | """ 53 | safe_name=quote(package_name, safe='') 54 | safe_ver=quote(package_name, safe='') if package_version is not None else None 55 | partial_url="{}".format(safe_name) if package_version is None else "{}/{}".format(safe_name,safe_ver) 56 | url="{}/pypi/{}/json".format(PyPackage.__PYPI_URL,partial_url) 57 | PyPackage.__logger.debug("Downloading package '{}' data from {}".format(package_name,url)) 58 | try: 59 | return PyPackage(json.loads(Utils.getUrlContent(url))) 60 | except Exception as e: 61 | if checked==True: 62 | return None 63 | raise PyPackageNotFoundException(safe_name,safe_ver) from e 64 | 65 | def __init__(self,package_data) -> None: 66 | self.package_data=package_data 67 | self.name=self.package_data["info"]["name"] 68 | self.version=self.package_data["info"]["version"] 69 | self.releases=None 70 | self.git_repository_url=None 71 | 72 | def getName(self) -> str: 73 | """ 74 | Get the package name 75 | 76 | Return (str): 77 | the package name 78 | """ 79 | return self.name 80 | 81 | def getVersion(self): 82 | """ 83 | Get the package version 84 | 85 | Return (str): 86 | the package version 87 | """ 88 | return self.version 89 | 90 | def getRelaeses(self) -> list[PyPackageRelease]: 91 | """ 92 | Get all the available releases for the package 93 | 94 | Return (list): 95 | the package name 96 | """ 97 | if self.releases==None: 98 | self.__loadReleases() 99 | return self.releases 100 | 101 | def __loadReleases(self) -> None: 102 | """ 103 | Extract from the package metadata the list of available release files and store them in the self.releases variable 104 | """ 105 | self.releases=[] 106 | for release in self.package_data["releases"][self.version]: 107 | if "url" in release: 108 | self.releases.append(PyPackageRelease(self, release["url"],release["packagetype"] if "packagetype" in release else 
None)) 109 | 110 | def getGitRepositoryUrl(self) -> str: 111 | """ 112 | Get the package git repository url, if found 113 | 114 | Return (str): 115 | the package git repository url if found, otherwise None 116 | """ 117 | if self.git_repository_url==None: 118 | self.__loadSourcesRepository() 119 | return self.git_repository_url 120 | 121 | def __loadSourcesRepository(self): 122 | """ 123 | Scan the package metadata searching for a source git repository and stor the value in "self.git_repository_url" 124 | """ 125 | github_link=None 126 | urls=self.package_data["info"]["project_urls"] if "project_urls" in self.package_data["info"] else None 127 | 128 | if urls is not None: 129 | for link_name in urls: 130 | link=urls[link_name] 131 | if "github" in link and ( github_link == None or len(github_link) > len(link)): 132 | if github_link == None: 133 | github_link=link 134 | 135 | self.git_repository_url=github_link 136 | 137 | def __str__(self): 138 | return "PyPackage[name:{}, version:{}, github:{}, release:({}){}]".format(self.name,self.version,self.githubPageLink,self.releaseLink[1],self.releaseLink[0]) 139 | 140 | 141 | class PyPackageRelease(): 142 | """ 143 | Class that represent a python package release 144 | """ 145 | 146 | def __init__(self, pypackage:PyPackage ,url:str): 147 | self.pypackage=pypackage 148 | self.url=url 149 | 150 | def getPyPackage(self) -> PyPackage: 151 | """ 152 | Get the package owner of this release 153 | 154 | Return (PyPackage): 155 | the package owner of this release 156 | """ 157 | self.pypackage 158 | 159 | def getDownloadUrl(self) -> str: 160 | """ 161 | Get the relase download url 162 | 163 | Return (str): 164 | the relase download url 165 | """ 166 | return self.url 167 | 168 | def getReleaseFileName(self) -> str: 169 | """ 170 | Get the relase file name 171 | 172 | Return (str): 173 | the relase file name 174 | """ 175 | return os.path.basename(urllib.parse.urlparse(self.url).path) 176 | 177 | def getReleaseFileType(self) -> str: 
178 | """ 179 | Get the relase file type (In practice the filename extension) 180 | 181 | Return (str): 182 | the the relase file type 183 | """ 184 | return self.getReleaseFileName().split(".")[-1] 185 | 186 | 187 | ################################## 188 | ## EXCEPTIONS 189 | ################################## 190 | 191 | class PyPackageNotFoundException(Exception): 192 | def __init__(self,package_name,package_version=None): 193 | if package_version is None: 194 | super().__init__("Py package '{}' not found".format(package_name)) 195 | else: 196 | super().__init__("Py package '{}' with version '{}' not found".format(package_name,package_version),False) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018-2021 University of Trento and LastPyMile contributors 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 
27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. 
For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 
If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. 
You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. 
(Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 204 | -------------------------------------------------------------------------------- /lastpymile/abstractpackageanalysis.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from abc import ABC, abstractmethod 3 | 4 | import logging 5 | import os, tempfile 6 | import time 7 | from datetime import datetime 8 | from typing import Any 9 | 10 | from .utils import Utils 11 | from .pypackage import PyPackage, PyPackageRelease 12 | from .gitrepository import GitRepository 13 | 14 | class AbstractPackageAnalysis(ABC): 15 | """ 16 | Abstarct class that contains the general execution process of an analyis for an PyPackage. 
17 | Mainly the analysis is diveded in 3 major step: 18 | 1- Sources scan: Sources are scanned and all required data for the sources is extrated/computed 19 | 2- Release scan: Release file is scanned and all required data for the release is extrated/computed 20 | 3- Analysis: sources data and release data are used to perform the actual analyisis. 21 | 22 | Methods listed below must be implemented: 23 | _isReleaseSupported(self,release): 24 | _checkPrerequisites(self,package:PyPackage) -> object: 25 | _scanSources(self,repository:GitRepository,stage_data:StageStatisticsData) -> object: 26 | _scanRelease(self,release:PyPackageRelease,stage_data:StageStatisticsData) -> object: 27 | _analyzeRelease(self,release:PyPackageRelease,source_data:object,release_data:object): 28 | """ 29 | 30 | def __init__(self, pyPackage:PyPackage, **options) -> None: 31 | self.pyPackage=pyPackage 32 | self.__logger=logging.getLogger("lastpymile."+type(self).__name__) 33 | self.__options=options 34 | self.__analysis_in_progress=False 35 | self._tmp_folder=None 36 | 37 | cache_folder=self._getOption("cache_folder",None) 38 | if cache_folder is not None: 39 | cache_folder=os.path.join(cache_folder,pyPackage.getName()+"_"+pyPackage.getVersion()) 40 | if not os.path.exists(cache_folder): 41 | os.makedirs(cache_folder) 42 | self._cache_folder=cache_folder 43 | 44 | def _getOption(self, name:str, default_value:Any=None) -> Any: 45 | """ 46 | Utility method to get a user option if defined, otherwise return the specified default value 47 | Parameters: 48 | name (str): name of the option 49 | default_value (Any): default value to return if the option is not specified (default:None) 50 | Retrun (Any): 51 | the specified option or the default value 52 | """ 53 | return self.__options[name] if name in self.__options else default_value 54 | 55 | def _getTempFolder(self) ->str: 56 | """ 57 | Get the current temporary folder. 
This method raise an exception if called otside an analyisis 58 | Retrun (str): 59 | the current temporary folder 60 | """ 61 | if self._tmp_folder is None: 62 | raise Exception("Invalid call to _getTempFolder") 63 | return self._tmp_folder 64 | 65 | def startAnalysis(self) -> map[str:Any]: 66 | """ 67 | Start the analysis of the package 68 | Retrun (map[str:Any]): 69 | a json serializable map that contain the results of the analysis 70 | """ 71 | 72 | if self.__analysis_in_progress==True: 73 | raise AnalysisException("Analysis already in progress") 74 | self.__analysis_in_progress=True 75 | 76 | analysis_report=AbstractPackageAnalysis.AnalysisReport(self.pyPackage) 77 | 78 | prerequisite_error=self._checkPrerequisites(self.pyPackage) 79 | if prerequisite_error is not None : 80 | if isinstance(prerequisite_error,str): 81 | self.__logger.critical(prerequisite_error) 82 | analysis_report.failed(prerequisite_error) 83 | return analysis_report.getReport() 84 | 85 | 86 | try: 87 | self._tmp_folder=self.__setupTempFolder(self._getOption("tmp_folder")) 88 | try: 89 | self.__doAnalysis(analysis_report) 90 | analysis_report.terminated() 91 | finally: 92 | if not self._getOption("keep_tmp_folder",False) and os.path.exists(self._getTempFolder()): 93 | self.__logger.debug("Deleting temp folder {}".format(self._getTempFolder())) 94 | Utils.rmtree(self._getTempFolder()) 95 | finally: 96 | self._tmpFolder=None 97 | self.__analysis_in_progress=False 98 | 99 | self.__logger.info("Package {} version:{} Analysis TERMINATED in {} seconds".format(self.pyPackage.getName(),self.pyPackage.getVersion(),analysis_report.getAnalysisDurationMs()/1000)) 100 | return analysis_report.getReport() 101 | 102 | def __doAnalysis(self, analysis_report:AnalysisReport) ->None: 103 | """ 104 | Internal method that pratically perform the analysis 105 | Parameters: 106 | analysis_report (AnalysisReport): AnalysisReport object used to store and organize the analyis result 107 | """ 108 | 109 | 
self.__logger.info("Package '{}' version:{} Analysis STARTED".format(self.pyPackage.getName(),self.pyPackage.getVersion())) 110 | 111 | releases=[] 112 | for release in self.pyPackage.getRelaeses(): 113 | if self._isReleaseSupported(release)==True: 114 | releases.append(release) 115 | 116 | if len(releases)==0: 117 | analysis_report.failed("No supported or selected releases found") 118 | return 119 | 120 | ### 121 | ### SOURCES PROCESSING 122 | ### 123 | try: 124 | self.__logger.info("Sources processing for package '{}' STARTED".format(self.pyPackage.getName(),self.pyPackage.getVersion())) 125 | stats_data=StageStatisticsData("processing_sources") 126 | sources_stage_data=self.__prepareSources(stats_data) 127 | stats_data.stageCompleted() 128 | analysis_report.addStatistics(stats_data) 129 | self.__logger.info("Sources processing for package '{}' TERMINATED".format(self.pyPackage.getName())) 130 | except AnalysisException as e: 131 | if self.__logger.isEnabledFor(logging.DEBUG): 132 | import traceback 133 | self.__logger.error("Sources processing for package '{}' TERMINATED with an ERROR:\n{}".format(self.pyPackage.getName(),traceback.format_exc())) 134 | else: 135 | self.__logger.error("Sources processing for package '{}' TERMINATED with an ERROR: {}".format(self.pyPackage.getName(),e)) 136 | analysis_report.failed(str(e)) 137 | return 138 | 139 | 140 | for release in releases: 141 | 142 | release_fileName=release.getReleaseFileName() 143 | try: 144 | self.__logger.info("Scan of release '{}' STARTED".format(release_fileName)) 145 | package_data=StageStatisticsData("package_{}".format(release.getReleaseFileName())) 146 | package_stage_data=self._scanRelease(release, package_data) 147 | package_data.stageCompleted() 148 | analysis_report.addStatistics(package_data) 149 | self.__logger.info("Analysis of release '{}' STARTED".format(release_fileName)) 150 | 151 | result=self._analyzeRelease(release,sources_stage_data,package_stage_data) 152 | 
analysis_report.addResult(result) 153 | 154 | self.__logger.info("Analysis of release '{}' TERMINATED".format(release_fileName)) 155 | 156 | except AnalysisException as e: 157 | if self.__logger.isEnabledFor(logging.DEBUG): 158 | import traceback 159 | self.__logger.error("Analysis of release '{}' TERMINATED with an AN ERROR:\n{}".format(release_fileName,traceback.format_exc())) 160 | else: 161 | self.__logger.error("Analysis of release '{}' TERMINATED with an AN ERROR: {}".format(release_fileName,e)) 162 | 163 | def __setupTempFolder(self, root_tmp_folder:str) -> str: 164 | """ 165 | Setup a temporary folder that is used during the analysis 166 | Parameters: 167 | root_tmp_folder (str): a path to a folder that is used as temporary folder. May be None, in that case the system temp folder is used 168 | Return (str): 169 | The path of the temporary folder created. (This folder may be seafly deleted after the analysis) 170 | """ 171 | if root_tmp_folder==None: 172 | tmp_folder= tempfile.mkdtemp() 173 | else: 174 | import time 175 | tmp_folder= os.path.join(root_tmp_folder,"lpm_"+(str(round(time.time() * 1000)).zfill(10))+"_"+Utils.sanitizeFolderName(self.pyPackage.getName(),20)+"_"+Utils.sanitizeFolderName(self.pyPackage.getVersion())) 176 | if os.path.exists(tmp_folder): 177 | Utils.rmtree(tmp_folder) 178 | os.makedirs(tmp_folder) 179 | 180 | self.__logger.info("Download folder set to {}".format(tmp_folder)) 181 | return tmp_folder 182 | 183 | @abstractmethod 184 | def _isReleaseSupported(self, release:PyPackageRelease) -> bool: 185 | """ 186 | Test if the specified release type is supported. 
If not supported the release is not processed 187 | This method mus be sublcassed 188 | Parameters: 189 | release (PyPackageRelease): the release object 190 | Return (bool): 191 | True if the release is supported, False otherwise 192 | """ 193 | return False 194 | 195 | def __prepareSources(self, statistics:StageStatisticsData) -> Any: 196 | """ 197 | Internal method that prepare the sources to be processed and call "_scanSources" 198 | Parameters: 199 | statistics (StageStatisticsData): object that can be used to report statistic data for the current analysis phase 200 | Return (Any): 201 | the object returned form _scanSources 202 | """ 203 | 204 | repository_fodler=self._getOption("repo_folder",None) 205 | clone_folder=os.path.join(self._getTempFolder(),"sources") 206 | 207 | if repository_fodler is None and self._cache_folder is not None: 208 | cached_repo_folder=os.path.join(self._cache_folder,"repo") 209 | if os.path.exists(cached_repo_folder): 210 | self.__logger.debug("Using chased repsoitory folder {}".format(cached_repo_folder)) 211 | repository_fodler=cached_repo_folder 212 | else: 213 | clone_folder=cached_repo_folder 214 | 215 | 216 | if repository_fodler is not None: 217 | repository=GitRepository.loadFromPath(repository_fodler) 218 | git_rep=repository_fodler 219 | else: 220 | git_url=self.pyPackage.getGitRepositoryUrl() 221 | if git_url is None: 222 | raise AnalysisException("Could not find a valid source repository") 223 | repository=GitRepository.cloneFromUrl(self.pyPackage.getGitRepositoryUrl(),clone_folder) 224 | git_rep=self.pyPackage.getGitRepositoryUrl() 225 | 226 | statistics.addStatistic("git_repository",git_rep) 227 | return self._scanSources(repository,statistics) 228 | 229 | @abstractmethod 230 | def _checkPrerequisites(self, package:PyPackage) -> str: 231 | """ 232 | Method called before the analysis start. Here all the prerequisites for the analysis are checked. 
233 | This method mus be sublcassed 234 | Parameters: 235 | package (PyPackage): the current package that will be analyzed 236 | Return (str): 237 | An error message that describe the error which prevent the analysis execution 238 | """ 239 | pass 240 | 241 | @abstractmethod 242 | def _scanSources(self, repository:GitRepository, statistics:StageStatisticsData) -> Any: 243 | """ 244 | Abstract method where sources are scanned and prepocessed. This method shoud return an object that will be used in the next analysis phase (_analyzeRelease:source_data). 245 | This method mus be sublcassed 246 | Parameters: 247 | repository (GitRepository): a GitRepository object 248 | statistics (StageStatisticsData): object that can be used to report statistic data for the current analysis phase 249 | 250 | Return (Any): 251 | any object that can be used in the _analyzeRelease phase 252 | """ 253 | pass 254 | 255 | @abstractmethod 256 | def _scanRelease(self,release:PyPackageRelease, statistics:StageStatisticsData) -> Any: 257 | """ 258 | Abstract method where release file are aextracted and prepocessed. This method shoud return an object that will be used in the next analysis phase (_analyzeRelease:release_data). 
259 | This method mus be sublcassed 260 | Parameters: 261 | release (PyPackageRelease): a PyPackageRelease object 262 | statistics (StageStatisticsData): object that can be used to report statistic data for the current analysis phase 263 | 264 | Return (map[str:ReleaseFileDescriptor]): 265 | any object that can be used in the _analyzeRelease phase 266 | """ 267 | pass 268 | 269 | @abstractmethod 270 | def _analyzeRelease(self,release:PyPackageRelease, source_data:Any, release_data:Any) -> map[str:Any]: 271 | """ 272 | Process the data from the previous phases and return a report 273 | Parameters: 274 | release (PyPackageRelease): the current release object that is analyzed 275 | source_data (Any): the data returned from the "_scanSources" method 276 | release_data (Any): the data returned from the "_scanRelease" method 277 | 278 | Return (map[str:Any]): 279 | A json serializable map containing the pakage anlaysis resutls data 280 | """ 281 | pass 282 | 283 | class AnalysisReport(): 284 | """ 285 | Conveninece class to store the analyis statistics and resutls 286 | """ 287 | 288 | def __init__(self,pyPackage): 289 | self.start_time=time.time() 290 | self.analysis_report={ 291 | "package":{"name":pyPackage.getName(),"version":pyPackage.getVersion()}, 292 | "date":datetime.now().strftime("%d/%m/%Y at %H:%M:%S.%f"), 293 | "duration_ms":"unknown", 294 | "completed":None, 295 | "fail_reason":None, 296 | "results":[], 297 | "statistics":[], 298 | } 299 | 300 | def failed(self,reason): 301 | self.analysis_report["completed"]=False 302 | self.analysis_report["fail_reason"]=reason 303 | 304 | def terminated(self): 305 | if self.analysis_report["completed"] is None: 306 | self.analysis_report["completed"]=True 307 | del self.analysis_report['fail_reason'] 308 | self.analysis_report["duration_ms"]=round((time.time()-self.start_time)*1000) 309 | 310 | def addStatistics(self,stage_data:StageStatisticsData): 311 | 
self.analysis_report["statistics"].append(stage_data.getStageStatistics()) 312 | 313 | def addResult(self,result): 314 | self.analysis_report["results"].append(result) 315 | 316 | def getAnalysisDurationMs(self): 317 | return self.analysis_report["duration_ms"] 318 | 319 | def getReport(self): 320 | report=dict(self.analysis_report) 321 | if len(report["results"])==0: 322 | del report["results"] 323 | if len(report["statistics"])==0: 324 | del report["statistics"] 325 | return report 326 | 327 | 328 | class StageStatisticsData(): 329 | """ 330 | Conveninece class usefult to store each analysis phase statistics data 331 | """ 332 | 333 | def __init__(self,stage_name): 334 | self.start_time=time.time() 335 | self.statistics={ 336 | "stage_name":stage_name, 337 | "duration_ms":"unknown" 338 | } 339 | 340 | def stageCompleted(self): 341 | self.statistics["duration_ms"]=round((time.time()-self.start_time)*1000) 342 | 343 | def addStatistic(self,name:str,value:object): 344 | self.statistics[name]=value 345 | 346 | def getStageStatistics(self): 347 | return self.statistics 348 | 349 | class AnalysisException(Exception): 350 | """ 351 | Exception class that wrap expection captured in this main class 352 | """ 353 | def __init__(self, message, trace_on_error=True): 354 | super().__init__(message) 355 | self.trace_on_error=trace_on_error 356 | 357 | def trace_on_error(self): 358 | return self.trace_on_error -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "certifi" 3 | version = "2021.10.8" 4 | description = "Python package for providing Mozilla's CA Bundle." 5 | category = "main" 6 | optional = false 7 | python-versions = "*" 8 | 9 | [[package]] 10 | name = "charset-normalizer" 11 | version = "2.0.10" 12 | description = "The Real First Universal Charset Detector. 
Open, modern and actively maintained alternative to Chardet." 13 | category = "main" 14 | optional = false 15 | python-versions = ">=3.5.0" 16 | 17 | [package.extras] 18 | unicode_backport = ["unicodedata2"] 19 | 20 | [[package]] 21 | name = "coloredlogs" 22 | version = "15.0.1" 23 | description = "Colored terminal output for Python's logging module" 24 | category = "main" 25 | optional = false 26 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 27 | 28 | [package.dependencies] 29 | humanfriendly = ">=9.1" 30 | 31 | [package.extras] 32 | cron = ["capturer (>=2.4)"] 33 | 34 | [[package]] 35 | name = "gitdb" 36 | version = "4.0.9" 37 | description = "Git Object Database" 38 | category = "main" 39 | optional = false 40 | python-versions = ">=3.6" 41 | 42 | [package.dependencies] 43 | smmap = ">=3.0.1,<6" 44 | 45 | [[package]] 46 | name = "gitpython" 47 | version = "3.1.25" 48 | description = "GitPython is a python library used to interact with Git repositories" 49 | category = "main" 50 | optional = false 51 | python-versions = ">=3.7" 52 | 53 | [package.dependencies] 54 | gitdb = ">=4.0.1,<5" 55 | 56 | [[package]] 57 | name = "humanfriendly" 58 | version = "10.0" 59 | description = "Human friendly output for text interfaces using Python" 60 | category = "main" 61 | optional = false 62 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 63 | 64 | [package.dependencies] 65 | pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} 66 | 67 | [[package]] 68 | name = "idna" 69 | version = "3.3" 70 | description = "Internationalized Domain Names in Applications (IDNA)" 71 | category = "main" 72 | optional = false 73 | python-versions = ">=3.5" 74 | 75 | [[package]] 76 | name = "lxml" 77 | version = "4.7.1" 78 | description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
79 | category = "main" 80 | optional = false 81 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" 82 | 83 | [package.extras] 84 | cssselect = ["cssselect (>=0.7)"] 85 | html5 = ["html5lib"] 86 | htmlsoup = ["beautifulsoup4"] 87 | source = ["Cython (>=0.29.7)"] 88 | 89 | [[package]] 90 | name = "pyreadline3" 91 | version = "3.3" 92 | description = "A python implementation of GNU readline." 93 | category = "main" 94 | optional = false 95 | python-versions = "*" 96 | 97 | [[package]] 98 | name = "requests" 99 | version = "2.27.1" 100 | description = "Python HTTP for Humans." 101 | category = "main" 102 | optional = false 103 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" 104 | 105 | [package.dependencies] 106 | certifi = ">=2017.4.17" 107 | charset-normalizer = {version = ">=2.0.0,<2.1.0", markers = "python_version >= \"3\""} 108 | idna = {version = ">=2.5,<4", markers = "python_version >= \"3\""} 109 | urllib3 = ">=1.21.1,<1.27" 110 | 111 | [package.extras] 112 | socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] 113 | use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] 114 | 115 | [[package]] 116 | name = "smmap" 117 | version = "5.0.0" 118 | description = "A pure Python implementation of a sliding window memory map manager" 119 | category = "main" 120 | optional = false 121 | python-versions = ">=3.6" 122 | 123 | [[package]] 124 | name = "typing-extensions" 125 | version = "4.0.1" 126 | description = "Backported and Experimental Type Hints for Python 3.6+" 127 | category = "main" 128 | optional = false 129 | python-versions = ">=3.6" 130 | 131 | [[package]] 132 | name = "urllib3" 133 | version = "1.26.8" 134 | description = "HTTP library with thread-safe connection pooling, file post, and more." 
135 | category = "main" 136 | optional = false 137 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" 138 | 139 | [package.extras] 140 | brotli = ["brotlipy (>=0.6.0)"] 141 | secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] 142 | socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] 143 | 144 | [metadata] 145 | lock-version = "1.1" 146 | python-versions = "^3.9" 147 | content-hash = "9a7f784372af83fc824c1419cdfbe86ff9572113facfb81062e52b641c655aaa" 148 | 149 | [metadata.files] 150 | certifi = [ 151 | {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"}, 152 | {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"}, 153 | ] 154 | charset-normalizer = [ 155 | {file = "charset-normalizer-2.0.10.tar.gz", hash = "sha256:876d180e9d7432c5d1dfd4c5d26b72f099d503e8fcc0feb7532c9289be60fcbd"}, 156 | {file = "charset_normalizer-2.0.10-py3-none-any.whl", hash = "sha256:cb957888737fc0bbcd78e3df769addb41fd1ff8cf950dc9e7ad7793f1bf44455"}, 157 | ] 158 | coloredlogs = [ 159 | {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, 160 | {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, 161 | ] 162 | gitdb = [ 163 | {file = "gitdb-4.0.9-py3-none-any.whl", hash = "sha256:8033ad4e853066ba6ca92050b9df2f89301b8fc8bf7e9324d412a63f8bf1a8fd"}, 164 | {file = "gitdb-4.0.9.tar.gz", hash = "sha256:bac2fd45c0a1c9cf619e63a90d62bdc63892ef92387424b855792a6cabe789aa"}, 165 | ] 166 | gitpython = [ 167 | {file = "GitPython-3.1.25-py3-none-any.whl", hash = "sha256:4f8b0e51713642b83f1eb10136d36a7f2c5278d188c5ae82b144cb1f4e848f0c"}, 168 | {file = "GitPython-3.1.25.tar.gz", hash = "sha256:b923e8952c2fe0d70b129fddd8511acc90b3070c8dddb20d86ca9911a5d0c248"}, 
169 | ] 170 | humanfriendly = [ 171 | {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, 172 | {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, 173 | ] 174 | idna = [ 175 | {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, 176 | {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, 177 | ] 178 | lxml = [ 179 | {file = "lxml-4.7.1-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:d546431636edb1d6a608b348dd58cc9841b81f4116745857b6cb9f8dadb2725f"}, 180 | {file = "lxml-4.7.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6308062534323f0d3edb4e702a0e26a76ca9e0e23ff99be5d82750772df32a9e"}, 181 | {file = "lxml-4.7.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:f76dbe44e31abf516114f6347a46fa4e7c2e8bceaa4b6f7ee3a0a03c8eba3c17"}, 182 | {file = "lxml-4.7.1-cp27-cp27m-win32.whl", hash = "sha256:d5618d49de6ba63fe4510bdada62d06a8acfca0b4b5c904956c777d28382b419"}, 183 | {file = "lxml-4.7.1-cp27-cp27m-win_amd64.whl", hash = "sha256:9393a05b126a7e187f3e38758255e0edf948a65b22c377414002d488221fdaa2"}, 184 | {file = "lxml-4.7.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:50d3dba341f1e583265c1a808e897b4159208d814ab07530202b6036a4d86da5"}, 185 | {file = "lxml-4.7.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44f552e0da3c8ee3c28e2eb82b0b784200631687fc6a71277ea8ab0828780e7d"}, 186 | {file = "lxml-4.7.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:e662c6266e3a275bdcb6bb049edc7cd77d0b0f7e119a53101d367c841afc66dc"}, 187 | {file = "lxml-4.7.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4c093c571bc3da9ebcd484e001ba18b8452903cd428c0bc926d9b0141bcb710e"}, 188 | {file 
= "lxml-4.7.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:3e26ad9bc48d610bf6cc76c506b9e5ad9360ed7a945d9be3b5b2c8535a0145e3"}, 189 | {file = "lxml-4.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:a5f623aeaa24f71fce3177d7fee875371345eb9102b355b882243e33e04b7175"}, 190 | {file = "lxml-4.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7b5e2acefd33c259c4a2e157119c4373c8773cf6793e225006a1649672ab47a6"}, 191 | {file = "lxml-4.7.1-cp310-cp310-win32.whl", hash = "sha256:67fa5f028e8a01e1d7944a9fb616d1d0510d5d38b0c41708310bd1bc45ae89f6"}, 192 | {file = "lxml-4.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:b1d381f58fcc3e63fcc0ea4f0a38335163883267f77e4c6e22d7a30877218a0e"}, 193 | {file = "lxml-4.7.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:38d9759733aa04fb1697d717bfabbedb21398046bd07734be7cccc3d19ea8675"}, 194 | {file = "lxml-4.7.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:dfd0d464f3d86a1460683cd742306d1138b4e99b79094f4e07e1ca85ee267fe7"}, 195 | {file = "lxml-4.7.1-cp35-cp35m-win32.whl", hash = "sha256:534e946bce61fd162af02bad7bfd2daec1521b71d27238869c23a672146c34a5"}, 196 | {file = "lxml-4.7.1-cp35-cp35m-win_amd64.whl", hash = "sha256:6ec829058785d028f467be70cd195cd0aaf1a763e4d09822584ede8c9eaa4b03"}, 197 | {file = "lxml-4.7.1-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:ade74f5e3a0fd17df5782896ddca7ddb998845a5f7cd4b0be771e1ffc3b9aa5b"}, 198 | {file = "lxml-4.7.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:41358bfd24425c1673f184d7c26c6ae91943fe51dfecc3603b5e08187b4bcc55"}, 199 | {file = "lxml-4.7.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6e56521538f19c4a6690f439fefed551f0b296bd785adc67c1777c348beb943d"}, 200 | {file = "lxml-4.7.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:5b0f782f0e03555c55e37d93d7a57454efe7495dab33ba0ccd2dbe25fc50f05d"}, 201 | {file = "lxml-4.7.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:490712b91c65988012e866c411a40cc65b595929ececf75eeb4c79fcc3bc80a6"}, 202 | {file = "lxml-4.7.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:34c22eb8c819d59cec4444d9eebe2e38b95d3dcdafe08965853f8799fd71161d"}, 203 | {file = "lxml-4.7.1-cp36-cp36m-win32.whl", hash = "sha256:2a906c3890da6a63224d551c2967413b8790a6357a80bf6b257c9a7978c2c42d"}, 204 | {file = "lxml-4.7.1-cp36-cp36m-win_amd64.whl", hash = "sha256:36b16fecb10246e599f178dd74f313cbdc9f41c56e77d52100d1361eed24f51a"}, 205 | {file = "lxml-4.7.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:a5edc58d631170de90e50adc2cc0248083541affef82f8cd93bea458e4d96db8"}, 206 | {file = "lxml-4.7.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:87c1b0496e8c87ec9db5383e30042357b4839b46c2d556abd49ec770ce2ad868"}, 207 | {file = "lxml-4.7.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:0a5f0e4747f31cff87d1eb32a6000bde1e603107f632ef4666be0dc065889c7a"}, 208 | {file = "lxml-4.7.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:bf6005708fc2e2c89a083f258b97709559a95f9a7a03e59f805dd23c93bc3986"}, 209 | {file = "lxml-4.7.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc15874816b9320581133ddc2096b644582ab870cf6a6ed63684433e7af4b0d3"}, 210 | {file = "lxml-4.7.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0b5e96e25e70917b28a5391c2ed3ffc6156513d3db0e1476c5253fcd50f7a944"}, 211 | {file = "lxml-4.7.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ec9027d0beb785a35aa9951d14e06d48cfbf876d8ff67519403a2522b181943b"}, 212 | {file = "lxml-4.7.1-cp37-cp37m-win32.whl", hash = "sha256:9fbc0dee7ff5f15c4428775e6fa3ed20003140560ffa22b88326669d53b3c0f4"}, 213 | {file = 
"lxml-4.7.1-cp37-cp37m-win_amd64.whl", hash = "sha256:1104a8d47967a414a436007c52f533e933e5d52574cab407b1e49a4e9b5ddbd1"}, 214 | {file = "lxml-4.7.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:fc9fb11b65e7bc49f7f75aaba1b700f7181d95d4e151cf2f24d51bfd14410b77"}, 215 | {file = "lxml-4.7.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:317bd63870b4d875af3c1be1b19202de34c32623609ec803b81c99193a788c1e"}, 216 | {file = "lxml-4.7.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:610807cea990fd545b1559466971649e69302c8a9472cefe1d6d48a1dee97440"}, 217 | {file = "lxml-4.7.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:09b738360af8cb2da275998a8bf79517a71225b0de41ab47339c2beebfff025f"}, 218 | {file = "lxml-4.7.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6a2ab9d089324d77bb81745b01f4aeffe4094306d939e92ba5e71e9a6b99b71e"}, 219 | {file = "lxml-4.7.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:eed394099a7792834f0cb4a8f615319152b9d801444c1c9e1b1a2c36d2239f9e"}, 220 | {file = "lxml-4.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:735e3b4ce9c0616e85f302f109bdc6e425ba1670a73f962c9f6b98a6d51b77c9"}, 221 | {file = "lxml-4.7.1-cp38-cp38-win32.whl", hash = "sha256:772057fba283c095db8c8ecde4634717a35c47061d24f889468dc67190327bcd"}, 222 | {file = "lxml-4.7.1-cp38-cp38-win_amd64.whl", hash = "sha256:13dbb5c7e8f3b6a2cf6e10b0948cacb2f4c9eb05029fe31c60592d08ac63180d"}, 223 | {file = "lxml-4.7.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:718d7208b9c2d86aaf0294d9381a6acb0158b5ff0f3515902751404e318e02c9"}, 224 | {file = "lxml-4.7.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:5bee1b0cbfdb87686a7fb0e46f1d8bd34d52d6932c0723a86de1cc532b1aa489"}, 225 | {file = 
"lxml-4.7.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:e410cf3a2272d0a85526d700782a2fa92c1e304fdcc519ba74ac80b8297adf36"}, 226 | {file = "lxml-4.7.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:585ea241ee4961dc18a95e2f5581dbc26285fcf330e007459688096f76be8c42"}, 227 | {file = "lxml-4.7.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a555e06566c6dc167fbcd0ad507ff05fd9328502aefc963cb0a0547cfe7f00db"}, 228 | {file = "lxml-4.7.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:adaab25be351fff0d8a691c4f09153647804d09a87a4e4ea2c3f9fe9e8651851"}, 229 | {file = "lxml-4.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:82d16a64236970cb93c8d63ad18c5b9f138a704331e4b916b2737ddfad14e0c4"}, 230 | {file = "lxml-4.7.1-cp39-cp39-win32.whl", hash = "sha256:59e7da839a1238807226f7143c68a479dee09244d1b3cf8c134f2fce777d12d0"}, 231 | {file = "lxml-4.7.1-cp39-cp39-win_amd64.whl", hash = "sha256:a1bbc4efa99ed1310b5009ce7f3a1784698082ed2c1ef3895332f5df9b3b92c2"}, 232 | {file = "lxml-4.7.1-pp37-pypy37_pp73-macosx_10_14_x86_64.whl", hash = "sha256:0607ff0988ad7e173e5ddf7bf55ee65534bd18a5461183c33e8e41a59e89edf4"}, 233 | {file = "lxml-4.7.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:6c198bfc169419c09b85ab10cb0f572744e686f40d1e7f4ed09061284fc1303f"}, 234 | {file = "lxml-4.7.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:a58d78653ae422df6837dd4ca0036610b8cb4962b5cfdbd337b7b24de9e5f98a"}, 235 | {file = "lxml-4.7.1-pp38-pypy38_pp73-macosx_10_14_x86_64.whl", hash = "sha256:e18281a7d80d76b66a9f9e68a98cf7e1d153182772400d9a9ce855264d7d0ce7"}, 236 | {file = "lxml-4.7.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8e54945dd2eeb50925500957c7c579df3cd07c29db7810b83cf30495d79af267"}, 237 | {file 
= "lxml-4.7.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:447d5009d6b5447b2f237395d0018901dcc673f7d9f82ba26c1b9f9c3b444b60"}, 238 | {file = "lxml-4.7.1.tar.gz", hash = "sha256:a1613838aa6b89af4ba10a0f3a972836128801ed008078f8c1244e65958f1b24"}, 239 | ] 240 | pyreadline3 = [ 241 | {file = "pyreadline3-3.3-py3-none-any.whl", hash = "sha256:0003fd0079d152ecbd8111202c5a7dfa6a5569ffd65b235e45f3c2ecbee337b4"}, 242 | {file = "pyreadline3-3.3.tar.gz", hash = "sha256:ff3b5a1ac0010d0967869f723e687d42cabc7dccf33b14934c92aa5168d260b3"}, 243 | ] 244 | requests = [ 245 | {file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"}, 246 | {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, 247 | ] 248 | smmap = [ 249 | {file = "smmap-5.0.0-py3-none-any.whl", hash = "sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94"}, 250 | {file = "smmap-5.0.0.tar.gz", hash = "sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936"}, 251 | ] 252 | typing-extensions = [ 253 | {file = "typing_extensions-4.0.1-py3-none-any.whl", hash = "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"}, 254 | {file = "typing_extensions-4.0.1.tar.gz", hash = "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e"}, 255 | ] 256 | urllib3 = [ 257 | {file = "urllib3-1.26.8-py2.py3-none-any.whl", hash = "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed"}, 258 | {file = "urllib3-1.26.8.tar.gz", hash = "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"}, 259 | ] 260 | -------------------------------------------------------------------------------- /lastpymile/maliciouscodepackageanalyzer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 
import logging
import os
import tempfile
import json  # BUG FIX: json.dumps/json.loads were used below but json was never imported
import zipfile
import tarfile
import hashlib
from shutil import which
from subprocess import Popen, PIPE
from typing import Any, Tuple

from .utils import Utils
from .abstractpackageanalysis import AbstractPackageAnalysis, StageStatisticsData, AnalysisException

from .pypackage import *
from .gitrepository import *
from lastpymile import pypackage


###
### Internal support classes
###

class FileDescriptor():
    """
    Abstract file descriptor, describing a general file.

    A file descriptor has a filename and can be extended to implement the
    getContent() method.
    """

    def __init__(self, filename: str):
        self.filename = filename

    def getFileName(self) -> str:
        """Return the file name, with backslashes normalized to forward slashes."""
        return self.filename.replace("\\", "/")

    def getContent(self):
        """
        Return the raw content of the described file, or None if unavailable.

        BUG FIX: the original declaration was missing the 'self' parameter,
        so calling it on an instance raised a TypeError.
        """
        return None


class GitFileDescriptor(FileDescriptor):
    """File descriptor for a file stored in a git repository at a specific commit."""

    def __init__(self, repository, commit_hexsha, filename):
        super().__init__(filename)
        self.repository = repository
        self.commit_hexsha = commit_hexsha

    def getCommitHexsha(self):
        """Return the hexsha of the commit this file belongs to."""
        return self.commit_hexsha

    def getContent(self):
        """
        Return the file content as stored in the repository at the commit.

        BUG FIX: the original referenced the non-existent attribute
        'self.commit'; the stored attribute is 'self.commit_hexsha'.
        """
        return self.repository.getCommitEntryContent(self.commit_hexsha, self.filename)


class ReleaseFileDescriptor(FileDescriptor):
    """File descriptor for a file extracted on disk under a base directory."""

    def __init__(self, dir, filename):
        super().__init__(filename)
        self.dir = dir

    def getFullFilePath(self):
        """Return the absolute path of the file (base dir + relative name)."""
        return os.path.join(self.dir, self.filename)

    def getContent(self):
        """Return the file content as bytes."""
        content = None
        with open(self.getFullFilePath(), "rb") as f:
            content = f.read()
        return content


class ZipFileDescriptor(FileDescriptor):
    """File descriptor for an entry inside an open zip archive."""

    def __init__(self, zipFile, zip_info):
        super().__init__(zip_info.filename)
        self.zipFile = zipFile
        self.zip_info = zip_info

    def getContent(self):
        """
        Return the entry content as bytes.

        Normally reads the entry in memory; if the compression method is not
        supported by ZipFile.read (NotImplementedError), falls back to
        extracting the entry to the system temp folder and reading it back.
        The zip_info filename is temporarily flattened to its basename for the
        extraction and always restored afterwards.
        """
        content = None
        try:
            content = self.zipFile.read(self.zip_info)
        except NotImplementedError:
            filename = self.zip_info.filename
            try:
                tmp_dir = tempfile.gettempdir()
                try:
                    self.zip_info.filename = filename.split("/")[-1]
                    self.zipFile.extract(self.zip_info, tmp_dir)
                    with open(os.path.join(tmp_dir, self.zip_info.filename), "rb") as f:
                        content = f.read()
                finally:
                    # Best-effort cleanup of the temporary extracted file.
                    try:
                        os.remove(os.path.join(tmp_dir, self.zip_info.filename))
                    except:
                        pass
            finally:
                self.zip_info.filename = filename

        return content


class TarFileDescriptor(FileDescriptor):
    """File descriptor for an entry inside an open tar archive."""

    def __init__(self, tar, tar_info):
        super().__init__(tar_info.name)
        self.tar = tar
        self.tar_info = tar_info

    def getContent(self):
        """
        Return the entry content as bytes.

        Normally streams the entry via extractfile(); on failure, falls back
        to extracting the entry to the system temp folder and reading it back.
        The tar_info name is temporarily flattened to its basename for the
        extraction and always restored afterwards.
        """
        content = None
        try:
            f = self.tar.extractfile(self.tar_info)
            if f is not None:
                content = f.read()
        except Exception:
            filename = self.tar_info.name
            try:
                tmp_dir = tempfile.gettempdir()
                try:
                    self.tar_info.name = filename.split("/")[-1]
                    self.tar.extract(self.tar_info, tmp_dir)
                    with open(os.path.join(tmp_dir, self.tar_info.name), "rb") as f:
                        content = f.read()
                finally:
                    # Best-effort cleanup of the temporary extracted file.
                    try:
                        os.remove(os.path.join(tmp_dir, self.tar_info.name))
                    except:
                        pass
            finally:
                self.tar_info.name = filename

        return content


###
### Analyzer implementation
###

class MaliciousCodePackageAnalyzer(AbstractPackageAnalysis):
    """
    Implementation class of an AbstractPackageAnalysis, that scans and searches
    for malicious code injection in python packages.
    """

    __SUPPORTED_RELEASES_TYPES = ["whl", "zip", "tar", "gz", "bz2", "xz", "egg"]

    __logger = logging.getLogger("lastpymile.MaliciousCodePackageAnalyzer")
    __report_logger = logging.getLogger("lastpymile_report")

    @classmethod
    def createAnaliysisForPackage(cls, package_name: str, package_version: str = None, checked: bool = False, **options) -> MaliciousCodePackageAnalyzer:
        """
        Static method to create a MaliciousCodePackageAnalyzer object that can be used to analyze a package.
        Parameters:
          package_name (str): the name of the python package to analyze
          package_version (str): the version of the package. May be None, in that case the latest version is automatically chosen
          checked (bool): If True no exception is raised if the package cannot be found (in case of error the method returns None). Default False

        Named options:
          tmp_folder (str): A path location that will be used as temporary folder. If None (default) the system temp folder is used
          repo_folder (str): A path location to a git repository that will be used as reference source repository. If None (default) the git repository is determined and cloned from the package metadata

          The following options are mainly used during development or debugging:

          keep_tmp_folder (bool): If True the temporary folder is not deleted - Default False
          cache_folder (str): A path location that will be used to store the downloaded artifacts and git repositories (to save bandwidth)
          cache_metadata_folder (str): A path location that will be used to store the package metadata info (to save bandwidth)

        Return (MaliciousCodePackageAnalyzer):
          A MaliciousCodePackageAnalyzer that can be used to analyze the requested package
        """
        cls.__logger.info("Searching package '{}' version:{}".format(package_name, "" if package_version is None else package_version))
        try:
            if "cache_metadata_folder" in options:
                cache_metadata_folder = options["cache_metadata_folder"]
                if not os.path.exists(cache_metadata_folder):
                    os.makedirs(cache_metadata_folder)
                data_file = os.path.join(cache_metadata_folder, "{}_{}".format(package_name, package_version if package_version is not None else "LATEST"))

                if not os.path.exists(data_file):
                    package_data = PyPackage._getPackageMetadata(package_name, package_version)
                    with open(data_file, "w") as f:
                        f.write(json.dumps(package_data))
                else:
                    cls.__logger.debug("Loading cached package data {}".format(data_file))
                    with open(data_file, "rb") as f:
                        package_data = json.loads(f.read())

                pyPackage = PyPackage(package_data)
            else:
                pyPackage = PyPackage.searchPackage(package_name, package_version)
            cls.__logger.info("Package '{}' version:{} FOUND".format(pyPackage.getName(), pyPackage.getVersion()))

            return MaliciousCodePackageAnalyzer(pyPackage, **options)
        except PyPackageNotFoundException as e:
            cls.__logger.error("Package '{}' version:{} NOT FOUND {}".format(package_name, "" if package_version is None else package_version, e))
            if checked:
                return None
            else:
                raise e

    def __init__(self, pyPackage: PyPackage, **options) -> None:
        super().__init__(pyPackage, **options)

    def _checkPrerequisites(self, package: PyPackage) -> str:
        """
        Method called before the analysis starts. Here all the prerequisites for the analysis are checked.
        Parameters:
          package (PyPackage): the current package that will be analyzed
        Return (str):
          An error message that describes the error which prevents the analysis execution
        """
        if which("bandit") is None:
            return "Bandit is required but has not been found!"

    def _isReleaseSupported(self, release: pypackage.PyPackageRelease) -> bool:
        """
        Test if the specified release type is supported. If not supported the release is not processed.
        Parameters:
          release (PyPackageRelease): the release object
        Return (bool):
          True if the release is supported, False otherwise
        """
        return release.getReleaseFileType() in MaliciousCodePackageAnalyzer.__SUPPORTED_RELEASES_TYPES

    def __isProcessableFile(self, file_descriptor: FileDescriptor) -> bool:
        """
        Test if the specified file is supported. If the method returns False the file is excluded from the analysis.
        Parameters:
          file_descriptor (FileDescriptor): a file descriptor object representing the file to test
        Return (bool):
          True if the file is supported, False otherwise
        """
        return file_descriptor.getFileName().endswith(".py")

    def _scanSources(self, repository: GitRepository, statistics: StageStatisticsData) -> dict[str, GitFileDescriptor]:
        """
        Scan the source files from the git repository, and return an object that will be used in the next analysis phase (_analyzeRelease:source_data).
        In particular, scan all the files and commits in the repository and build a map of [file_hash, file].
        Parameters:
          repository (GitRepository): a GitRepository object
          statistics (StageStatisticsData): object that can be used to report statistic data for the current analysis phase

        Return (dict[str, GitFileDescriptor]):
          A map of [file_hash, file]
        """
        source_files_hashes = {}
        commits = repository.getCommitsList()
        commits_len = len(commits)
        processed_files = 0
        i = 1
        for commit_hash in commits:
            self.__logger.debug("Processing commit {}/{} ({})".format(i, commits_len, commit_hash))
            i += 1
            commit = repository.checkoutCommit(commit_hash)
            files_at_commit = repository.getFilesAtCommit(commit)
            for cmt_file in commit.stats.files:
                if cmt_file not in files_at_commit:  # File has been deleted
                    continue
                git_fd = GitFileDescriptor(repository, commit.hexsha, cmt_file)
                if self.__isProcessableFile(git_fd):
                    file_hash = self.__computeFileHash(os.path.join(repository.getRepositoryFolder(), cmt_file))
                    source_files_hashes[file_hash] = git_fd
                    processed_files += 1

        statistics.addStatistic("processed_commits", commits_len)
        statistics.addStatistic("processed_files", processed_files)
        return source_files_hashes

    def _scanRelease(self, release: PyPackageRelease, statistics: StageStatisticsData) -> dict[str, ReleaseFileDescriptor]:
        """
        Download and scan the release file, and return an object that will be used in the next analysis phase (_analyzeRelease:release_data).
        In particular, extract the release file and build a map of all supported files [file_hash, file].
        Parameters:
          release (PyPackageRelease): a PyPackageRelease object
          statistics (StageStatisticsData): object that can be used to report statistic data for the current analysis phase

        Return (dict[str, ReleaseFileDescriptor]):
          A map of [file_hash, file]
        """
        release_file_name = release.getReleaseFileName()

        if self._cache_folder is not None:
            destFile = os.path.join(self._cache_folder, release_file_name)
        else:
            destFile = os.path.join(self._getTempFolder(), release_file_name)

        if not os.path.exists(destFile):
            try:
                self.__logger.debug("Downloading release file {} to {}".format(release_file_name, destFile))
                Utils.downloadUrl(release.getDownloadUrl(), destFile)
            except Exception as e:
                raise AnalysisException("Unable to download release file content") from e
        else:
            self.__logger.debug("Using cached release file {}".format(destFile))

        extract_folder = os.path.join(self._getTempFolder(), "release__" + release_file_name + "___" + release.getReleaseFileType())
        self.__logger.debug("Extracting release file {} to {}".format(release_file_name, extract_folder))
        file_count = self.__extractReleaseFile(release.getReleaseFileName(), destFile, extract_folder)
        statistics.addStatistic("processed_files", file_count)

        return self.__collectFilesHashes(extract_folder)

    def __extractReleaseFile(self, release_file_name: str, release_archive_file: str, extract_folder: str) -> int:
        """
        Extract all supported files from the release archive.
        Parameters:
          release_file_name (str): the name of the release archive
          release_archive_file (str): path of the downloaded archive file
          extract_folder (str): path where the release archive is extracted

        Return (int):
          The number of extracted files (None for an unsupported extension,
          which _isReleaseSupported should have already filtered out)
        """
        try:
            ext = release_archive_file.split('.')[-1]
            if ext == "whl":
                return self.__extractZip(release_archive_file, extract_folder)
            elif ext == "zip":
                return self.__extractZip(release_archive_file, extract_folder)
            elif ext == "tar":
                return self.__extractTar(release_archive_file, extract_folder)
            elif ext == "gz":
                return self.__extractTar(release_archive_file, extract_folder, "gz")
            elif ext == "bz2":
                return self.__extractTar(release_archive_file, extract_folder, "bz2")
            elif ext == "xz":
                return self.__extractTar(release_archive_file, extract_folder, "xz")
            elif ext == "egg":
                # BUG FIX: .egg files are zip archives, not xz-compressed tarballs.
                return self.__extractZip(release_archive_file, extract_folder)

        except Exception as e:
            raise AnalysisException("Unable to extract release file content of release {}".format(release_file_name)) from e

    def __extractZip(self, archive_file: str, extract_folder: str) -> int:
        """
        Extract all supported files in the specified release zip file into the specified extract folder.
        Parameters:
          archive_file (str): path of the archive file
          extract_folder (str): path where the release archive is extracted

        Return (int):
          The number of extracted files
        """
        file_count = 0
        # BUG FIX: the original wrapped an open() handle that was never closed;
        # passing the path lets the ZipFile context manager own and close it.
        with zipfile.ZipFile(archive_file) as fzip:
            for zip_info in fzip.infolist():
                if not zip_info.is_dir():
                    fd = ZipFileDescriptor(fzip, zip_info)
                    if self.__isProcessableFile(fd):
                        fzip.extract(zip_info, extract_folder)
                        file_count += 1
        return file_count

    def __extractTar(self, archive_file: str, extract_folder, mode: str = None) -> int:
        """
        Extract all supported files in the specified release tar file into the specified extract folder.
        Parameters:
          archive_file (str): path of the archive file
          extract_folder (str): path where the release archive is extracted
          mode (str): optional argument that indicates which mode must be used to open the tar archive. See: https://docs.python.org/3/library/tarfile.html

        Return (int):
          The number of extracted files
        """
        file_count = 0
        with tarfile.open(archive_file, mode="r" + (":" + mode if mode is not None else "")) as tar:
            for tar_info in tar.getmembers():
                if tar_info.isfile():
                    fd = TarFileDescriptor(tar, tar_info)
                    if self.__isProcessableFile(fd):
                        tar.extract(tar_info, extract_folder)
                        file_count += 1
        return file_count

    def __collectFilesHashes(self, folder: str) -> dict[str, ReleaseFileDescriptor]:
        """
        Recursively scan all files in the specified folder and compute their hashes.
        Parameters:
          folder (str): path of the folder to scan

        Return (dict[str, ReleaseFileDescriptor]):
          A map of [file_hash, file descriptor]
        """
        file_hashes = {}
        for path, subdirs, files in os.walk(folder):
            for name in files:
                full_file_path = os.path.join(path, name)
                relative_file_path = os.path.join(os.path.relpath(path, folder), name)
                file_hash = self.__computeFileHash(full_file_path)
                file_hashes[file_hash] = ReleaseFileDescriptor(folder, relative_file_path)
        return file_hashes

    def _analyzeRelease(self, release: PyPackageRelease, source_data: Any, release_data: Any) -> dict[str, Any]:
        """
        Search for phantom files (files that are not found in the git repository) and process them with the bandit4mal tool to find potentially dangerous code.
        Parameters:
          release (PyPackageRelease): the current release object that is analyzed
          source_data (dict[str, GitFileDescriptor]): the data returned from the "_scanSources" method
          release_data (dict[str, ReleaseFileDescriptor]): the data returned from the "_scanRelease" method

        Return (dict[str, Any]):
          A json serializable map containing the package analysis results data
        """
        result = {
            "release": release.getReleaseFileName(),
            "status": None,
            "coherent_files": [],
            "phantom_files": [],
            "low_risk_files": [],
            "medium_risk_files": [],
            "high_risk_files": [],
        }

        for release_hash in release_data:
            rel_fd = release_data[release_hash]
            file_name = rel_fd.getFileName()

            if release_hash not in source_data:

                risk_level, report = self.__banditCheck(rel_fd.getFullFilePath())
                file_result = {
                    "file": file_name,
                    "file_hash": release_hash,
                    "bandit_report": report,
                }
                if risk_level == 0:
                    result["phantom_files"].append(file_result)
                    self.__report_logger.info("Found a phantom file '{}' in release file {}".format(file_name, release.getReleaseFileName()))
                elif risk_level == 1:
                    result["low_risk_files"].append(file_result)
                    self.__report_logger.warning("Found a LOW risk phantom file '{}' in release file {}".format(file_name, release.getReleaseFileName()))
                elif risk_level == 2:
                    result["medium_risk_files"].append(file_result)
                    self.__report_logger.error("Found a MEDIUM risk phantom file '{}' in release file {}".format(file_name, release.getReleaseFileName()))
                else:
                    result["high_risk_files"].append(file_result)
                    self.__report_logger.critical("Found a HIGH risk phantom file '{}' in release file {}".format(file_name, release.getReleaseFileName()))

            else:
                src_fd = source_data[release_hash]
                result["coherent_files"].append({
                    "file": file_name,
                    "file_hash": release_hash,
                    "commit_hash": src_fd.getCommitHexsha(),
                    "commit_file": src_fd.getFileName(),
                })
                self.__report_logger.info("File '{}' in release file {} is coherent".format(file_name, release.getReleaseFileName()))

        # BUG FIX: the original mixed independent 'if' statements into this
        # chain, so e.g. a release with high-risk files but no low-risk or
        # phantom files was reported as "coherent", and "warning" could
        # overwrite "danger". A single elif chain ranks severities correctly.
        if len(result["high_risk_files"]) > 0:
            status = "critic"
        elif len(result["medium_risk_files"]) > 0:
            status = "danger"
        elif len(result["low_risk_files"]) > 0:
            status = "warning"
        elif len(result["phantom_files"]) > 0:
            status = "stable"
        else:
            status = "coherent"

        result["status"] = status
        return result

    def __computeFileHash(self, file_name: str) -> str:
        """
        Compute a SHA-512 hash for the specified file.
        Parameters:
          file_name (str): a path pointing to the file whose hash has to be calculated
        Return (str):
          The file hash
        """
        h = hashlib.sha512()
        b = bytearray(128 * 1024)
        mv = memoryview(b)
        with open(file_name, 'rb', buffering=0) as f:
            # readinto() fills the reusable buffer; iter() stops at EOF (0 bytes read).
            for n in iter(lambda: f.readinto(mv), 0):
                h.update(mv[:n])
        return h.hexdigest()

    def __computeStreamHash(self, stream) -> str:
        """
        Compute a SHA-512 hash for the specified stream.
        Parameters:
          stream: a readable binary stream whose hash has to be calculated
        Return (str):
          The stream hash
        """
        BUF_SIZE = 65536  # read in 64kb chunks
        alg = hashlib.sha512()
        while True:
            data = stream.read(BUF_SIZE)
            if not data:
                break
            alg.update(data)
        return alg.hexdigest()

    def __banditCheck(self, file: str) -> Tuple[int, list]:
        """
        Launch the bandit analysis on the specified file.
        Parameters:
          file (str): path of the file to analyze with bandit
        Return (Tuple[int, list]):
          The overall risk level (0=none, 1=low, 2=medium, 3=high) and a json
          serializable list containing the bandit's file scan results
        """
        self.__logger.debug("Bandit analysis of file {}".format(file))
        proc = Popen(["bandit", file, "-q", "-f", "json"], stdout=PIPE, stderr=PIPE)
        output, _ = proc.communicate()
        report = json.loads(output.decode("utf-8"))

        result = []
        risk_level = 0
        if len(report["results"]) > 0:
            report_results_allowed_keys = ["test_id", "test_name", "issue_confidence", "issue_severity", "issue_text", "line_number", "line_range", "code"]
            for report_result in report["results"]:
                if "issue_severity" in report_result:
                    severity = report_result["issue_severity"].upper()
                    if severity == "LOW":
                        rl = 1
                    elif severity == "MEDIUM":
                        rl = 2
                    elif severity == "HIGH":
                        rl = 3
                    else:
                        # BUG FIX: an unrecognized severity (e.g. "UNDEFINED")
                        # left 'rl' unbound and raised UnboundLocalError.
                        rl = 0
                    risk_level = max(risk_level, rl)
                res = {}
                for key in report_results_allowed_keys:
                    if key in report_result:
                        res[key] = report_result[key]
                result.append(res)
        return risk_level, result