diff --git a/src/parsing/cmake_parser.rs b/src/parsing/cmake_parser.rs index 1e92c0a..999589c 100644 --- a/src/parsing/cmake_parser.rs +++ b/src/parsing/cmake_parser.rs @@ -112,7 +112,7 @@ impl CMakeParser { arg_dict.insert(pair[0].clone(), pair[1].clone()); } if let Some(dangling) = groups.chunks_exact(2).remainder().first() { - eprintln!("Warning: Dangling key without value: {}", dangling); + eprintln!("Warning: Dangling key without value: {dangling}"); } if arg_dict.contains_key("URL") { diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index 9a6a52f..9ab16aa 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -10,12 +10,14 @@ use std::{fs, mem}; use streaming_iterator::StreamingIterator; use rusqlite::params; -use tree_sitter::{Parser, Query, QueryCursor}; +use tree_sitter::{Parser, Query, QueryCapture, QueryCursor}; use super::parser::{par_file_iter, LibProcessor}; use super::parser::{LangInclude, LibParser, SourceFinder}; use crate::dataset::database::Database; +use crate::parsing::bash_parser; +use crate::parsing::parser::SystemProgram; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum PythonImport { @@ -358,6 +360,31 @@ lazy_static::lazy_static! { ].into_iter().collect(); } +lazy_static::lazy_static! { + static ref PYTHON_SYS_CALL_QUERY: Query = Query::new( + &tree_sitter_python::LANGUAGE.into(), + r#" + ; 1) free‐standing calls: foo(arg1, arg2) + (call + function: (identifier) @function_name + arguments: (argument_list (expression) @arg_list + ) + ) + + ; 2) single‐module calls: os.system("…") + (call + function: (attribute + object: (identifier) @module + attribute: (identifier) @function_name + ) + arguments: (argument_list + (expression) @arg_list + ) + ) + "# + ).expect("Error creating query"); +} + impl<'db> PythonParser<'db> { pub fn new(package_database: &'db Database, os_database: &'db Database) -> Self { PythonParser { @@ -450,13 +477,117 @@ impl<'db> PythonParser<'db> { imports } - fn process_files(&self, file_paths: T) -> HashMap>> + fn is_likely_syscall(module: &str, func: &str) -> bool { + let combined = format!("{module}.{func}"); + let predefined = ["os.system", "subprocess.run"]; + + predefined.contains(&combined.as_str()) + } + + pub fn extract_sys_calls(file_path: &Path) -> HashSet { + let mut calls = HashSet::new(); // variable to hold the final grouping of calls + let source_code = match fs::read_to_string(file_path) // read the file into a string + { + Ok(content) => content, + Err(e) => + { + eprintln!("Error reading {}: {}", file_path.to_str().unwrap(), e); + return calls; + } + }; + + // parse with tree-sitter + let mut parser = Parser::new(); // create a new parser + parser + .set_language(&tree_sitter_python::LANGUAGE.into()) // set the parser language + .expect("Error loading Python grammar"); + let tree = parser.parse(&source_code, None).unwrap(); // create a tree + let root = tree.root_node(); // set the root node + + let mut query_cursor = QueryCursor::new(); // object to query the tree + let mut matches = + query_cursor.matches(&PYTHON_SYS_CALL_QUERY, root, source_code.as_bytes()); // look for matches in the src file as bytes + + while let Some(m) = matches.next() + // loop to process each match that is found + { + // capture slots + let mut func_name: Option = None; // variable to hold the function name + let mut args_node = None; // variable to hold the args + let mut module_name = None; // variable to hold the combined module and function name + + for QueryCapture { node, index, .. } in m.captures + // for loop to loop over the matches + { + let capture_name = PYTHON_SYS_CALL_QUERY.capture_names()[*index as usize]; // represents the current capture + match capture_name // set the func_name and args_node variables to what was in the capture + { + "function_name" => + { + if let Ok(t) = node.utf8_text(source_code.as_bytes()) + { + func_name = Some(t.to_string()); + } + } + "arg_list" => + { + args_node = Some(node); + } + "module" => + { + if let Ok(t) = node.utf8_text(source_code.as_bytes()) + { + module_name = Some(t.to_string()); + } + } + _ => {} + } + } + + if let (Some(f), Some(arg_list_node)) = (func_name, args_node) + // check if both variables are not None + { + // println!("func_name = {} ", f); + // println!("module_name = {:?} ", module_name); + + if let Some(ref module) = module_name { + if !Self::is_likely_syscall(module, &f) { + continue; // not a system call we're interested in at the moment, so skip analysis + } + + let mut stack = vec![*arg_list_node]; + while let Some(node) = stack.pop() { + if node.kind() == "string" { + if let Ok(raw) = node.utf8_text(source_code.as_bytes()) { + let cleaned = + raw.trim_matches('"').trim_matches('\'').replace('\n', " "); + if let Some(cmd) = bash_parser::parse_bash_command(&cleaned) { + calls.insert(LangInclude::OS(SystemProgram::Application(cmd))); + } + } + continue; // we've handled this node; don't also walk its children + } + + let mut child_cursor = node.walk(); + for child in node.children(&mut child_cursor) { + stack.push(child); + } + } + } + } + } + + calls + } + + fn process_files(&self, file_paths: T) -> HashMap>> where T: IntoIterator, T::Item: AsRef, { //Using Rayon for parallel processing associates wrapping set with Mutex for synchronization let global_imports: Mutex> = Mutex::new(HashSet::new()); + let global_sys_calls: Mutex> = Mutex::new(HashSet::new()); par_file_iter(file_paths, |file_path| { let file_includes = Self::extract_includes(file_path); @@ -464,6 +595,14 @@ impl<'db> PythonParser<'db> { for include in file_includes { global_includes.insert(include); } + // syscalls + let file_sys_calls = Self::extract_sys_calls(file_path); + { + let mut g = global_sys_calls.lock().unwrap(); + for sc in file_sys_calls { + g.insert(sc); + } + } }); //Prepare SQL for database query @@ -517,7 +656,35 @@ impl<'db> PythonParser<'db> { } } - global_import_map + // Map collected syscalls to packages + let global_sys_calls = mem::take(&mut *global_sys_calls.lock().unwrap()); + let mut syscall_map: HashMap>> = HashMap::new(); + + for call in global_sys_calls.into_iter() { + let func_name = match &call { + LangInclude::OS(SystemProgram::Application(cmd)) => cmd.to_lowercase(), + _ => continue, + }; + + if let Ok(libs) = query_db(&func_name) { + // Uncomment below if you only want to return strings that appear in the database + // if !libs.is_empty() { + // syscall_map.insert(call, vec![libs]); + // } + syscall_map.insert(call, vec![libs]); + } + } + + // Merge both maps into the required return type + let mut result: HashMap>> = HashMap::new(); + for (import, libs) in global_import_map { + result.insert(LangInclude::Python(import), libs); + } + for (call, libs) in syscall_map { + result.insert(call, libs); + } + + result } } @@ -533,13 +700,11 @@ impl LibParser for PythonParser<'_> { .collect() } - fn extract_sys_calls(_file_path: &Path) -> HashSet + fn extract_sys_calls(file_path: &Path) -> HashSet where Self: Sized, { - //Argument _file_path prefixed with underscore to prevent complaints from cargo - //Rename to "file_path" when implementing - HashSet::new() + Self::extract_sys_calls(file_path) } } @@ -549,10 +714,9 @@ impl LibProcessor for PythonParser<'_> { T: IntoIterator, T::Item: AsRef, { - // fn process_files(&self, file_path: Vec<&str>) -> Vec<(LangInclude, Vec)>{ self.process_files(file_paths) .into_iter() - .map(|(python_include, vec)| (LangInclude::Python(python_include), vec)) + //.map(|(python_include, vec)| (python_include, vec)) .collect() } } @@ -638,4 +802,35 @@ mod tests { .collect(); assert_eq!(from_alias_imports, exp_from_alias_imports); } + + #[test] + fn test_extract_python_syscalls() { + let test_file = Path::new("tests/test_files/test_sys_calls.py"); + // Run the extractor + let calls = PythonParser::extract_sys_calls(test_file); + + // Collect only OS application names from the result set + let mut found: HashSet = HashSet::new(); + for call in calls { + if let LangInclude::OS(SystemProgram::Application(name)) = call { + found.insert(name); + } + } + + // Positive cases + assert!( + found.contains("ls"), + "Expected to find 'ls' from os.system(\"ls -l /tmp\")" + ); + assert!( + found.contains("echo"), + "Expected to find 'echo' from subprocess.run(\"echo hello\", shell=True)" + ); + + // Negative case: free-standing run(...) is not in the allowed (module.func) list + assert!( + !found.contains("rm"), + "Did not expect to match free-standing run(\"rm ...\")" + ); + } } diff --git a/tests/test_files/test_sys_calls.py b/tests/test_files/test_sys_calls.py new file mode 100644 index 0000000..56d0c67 --- /dev/null +++ b/tests/test_files/test_sys_calls.py @@ -0,0 +1,31 @@ +import os +import subprocess +import subprocess as sp +from subprocess import run as sproot_run + + +def positives(): + # Should match: "os.system" + os.system("ls -l /tmp") + + # Should match: "os.system" + os.system("echo from os.system") + + # Should match: "subprocess.run" + subprocess.run(["echo", "from subprocess.run"]) + + + # Depending on your resolver, these MAY or MAY NOT match + # (aliases/from-imports). Keep them if you want extra coverage. + sp.run("echo from sp.run", shell=True) # alias of subprocess + sproot_run(["echo", "from from-import"]) # from-import alias + +def negatives(): + # Should NOT match (not in the allowlist) + subprocess.check_call(["echo", "should NOT match"]) + os.path.join("a", "b") # not a syscall + run("rm -rf /tmp/somewhere") + +if __name__ == "__main__": + positives() + negatives() \ No newline at end of file