From 0e9aa28d759d2b0e1c7ae943c3cbdea5be4f0069 Mon Sep 17 00:00:00 2001 From: Micaela Date: Wed, 1 Oct 2025 15:31:39 -0700 Subject: [PATCH 1/7] feat: add system call identification for python code --- src/parsing/python_parser.rs | 234 +++++++++++++++++++++++++++++++++-- 1 file changed, 225 insertions(+), 9 deletions(-) diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index 9a6a52f..c2527f8 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -10,12 +10,14 @@ use std::{fs, mem}; use streaming_iterator::StreamingIterator; use rusqlite::params; -use tree_sitter::{Parser, Query, QueryCursor}; +use tree_sitter::{Parser, Query, QueryCapture, QueryCursor}; use super::parser::{par_file_iter, LibProcessor}; use super::parser::{LangInclude, LibParser, SourceFinder}; use crate::dataset::database::Database; +use crate::parsing::bash_parser; +use crate::parsing::parser::SystemProgram; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum PythonImport { @@ -358,6 +360,31 @@ lazy_static::lazy_static! { ].into_iter().collect(); } +lazy_static::lazy_static! { + static ref PYTHON_SYS_CALL_QUERY: Query = Query::new( + &tree_sitter_python::LANGUAGE.into(), + r#" + ; 1) free‐standing calls: foo(arg1, arg2) + (call + function: (identifier) @function_name + arguments: (argument_list (expression) @arg_list + ) + ) + + ; 2) single‐module calls: os.system("…") + (call + function: (attribute + object: (identifier) @module + attribute: (identifier) @function_name + ) + arguments: (argument_list + (expression) @arg_list + ) + ) + "# + ).expect("Error creating query"); +} + impl<'db> PythonParser<'db> { pub fn new(package_database: &'db Database, os_database: &'db Database) -> Self { PythonParser { @@ -450,13 +477,137 @@ impl<'db> PythonParser<'db> { imports } - fn process_files(&self, file_paths: T) -> HashMap>> + fn is_likely_syscall(module: &str, func: &str) -> bool + { + let combined = format!("{}.{}", module, func); + let predefined = ["os.system", "subprocess.run", "os.run"]; + + if predefined.contains(&combined.as_str()) + { + // println!("Matched: {}", combined); + true + } + else + { + { + false + } + } + } + + pub fn extract_sys_calls(file_path: &Path) -> HashSet + { + let mut calls = HashSet::new(); // variable to hold the final grouping of calls + let source_code = match fs::read_to_string(file_path) // read the file into a string + { + Ok(content) => content, + Err(e) => + { + eprintln!("Error reading {}: {}", file_path.to_str().unwrap(), e); + return calls; + } + }; + + // parse with tree-sitter + let mut parser = Parser::new(); // create a new parser + parser + .set_language(&tree_sitter_python::LANGUAGE.into()) // set the parser language + .expect("Error loading Python grammar"); + let tree = parser.parse(&source_code, None).unwrap(); // create a tree + let root = tree.root_node(); // set the root node + + let mut query_cursor = QueryCursor::new(); // object to query the tree + let mut matches = + query_cursor.matches(&PYTHON_SYS_CALL_QUERY, root, source_code.as_bytes()); // look for matches in the src file as bytes + + while let Some(m) = matches.next() + // loop to process each match that is found + { + // capture slots + let mut func_name: Option = None; // variable to hold the function name + let mut args_node = None; // variable to hold the args + let mut module_name = None; // variable to hold the combined module and function name + + for QueryCapture { node, index, .. } in m.captures + // for loop to loop over the matches + { + let capture_name = &PYTHON_SYS_CALL_QUERY.capture_names()[*index as usize][..]; // represents the current capture + match capture_name // set the func_name and args_node variables to what was in the capture + { + "function_name" => + { + if let Ok(t) = node.utf8_text(source_code.as_bytes()) + { + func_name = Some(t.to_string()); + } + } + "arg_list" => + { + args_node = Some(node); + } + "module" => + { + if let Ok(t) = node.utf8_text(source_code.as_bytes()) + { + module_name = Some(t.to_string()); + } + } + _ => {} + } + } + + if let (Some(f), Some(arg_list_node)) = (func_name, args_node) + // check if both variables are not None + { + // println!("func_name = {} ", f); + // println!("module_name = {:?} ", module_name); + + if let Some(ref module) = module_name + { + if !Self::is_likely_syscall(module, &f) + { + continue; // not a system call we're interested in at the moment, so skip analysis + } + + let mut stack = vec![*arg_list_node]; + while let Some(node) = stack.pop() { + if node.kind() == "string" { + if let Ok(raw) = node.utf8_text(source_code.as_bytes()) { + let cleaned = raw + .trim_matches('"') + .trim_matches('\'') + .replace('\n', " "); + if let Some(cmd) = bash_parser::parse_bash_command(&cleaned) { + calls.insert(LangInclude::OS(SystemProgram::Application(cmd))); + } + } + continue; // we've handled this node; don't also walk its children + } + + let mut child_cursor = node.walk(); + for child in node.children(&mut child_cursor) { + stack.push(child); + } + } + + } + } + + } + + calls + + } + + + fn process_files(&self, file_paths: T) -> HashMap>> where T: IntoIterator, T::Item: AsRef, { //Using Rayon for parallel processing associates wrapping set with Mutex for synchronization let global_imports: Mutex> = Mutex::new(HashSet::new()); + let global_sys_calls: Mutex> = Mutex::new(HashSet::new()); par_file_iter(file_paths, |file_path| { let file_includes = Self::extract_includes(file_path); @@ -464,8 +615,18 @@ impl<'db> PythonParser<'db> { for include in file_includes { global_includes.insert(include); } + // syscalls + let file_sys_calls = Self::extract_sys_calls(file_path); + { + let mut g = global_sys_calls.lock().unwrap(); + for sc in file_sys_calls { + g.insert(sc); + } + } }); + + //Prepare SQL for database query //TODO: Double check this, might want to normalize and change query to normalized_name let mut sql_statement = self @@ -517,7 +678,33 @@ impl<'db> PythonParser<'db> { } } - global_import_map + // Map collected syscalls to packages + let global_sys_calls = mem::take(&mut *global_sys_calls.lock().unwrap()); + let mut syscall_map: HashMap>> = HashMap::new(); + + for call in global_sys_calls.into_iter() { + let func_name = match &call { + LangInclude::OS(SystemProgram::Application(cmd)) => cmd.to_lowercase(), + _ => continue, + }; + + if let Ok(libs) = query_db(&func_name) { + if !libs.is_empty() { + syscall_map.insert(call, vec![libs]); + } + } + } + + // Merge both maps into the required return type + let mut result: HashMap>> = HashMap::new(); + for (import, libs) in global_import_map { + result.insert(LangInclude::Python(import), libs); + } + for (call, libs) in syscall_map { + result.insert(call, libs); + } + + result } } @@ -533,13 +720,11 @@ impl LibParser for PythonParser<'_> { .collect() } - fn extract_sys_calls(_file_path: &Path) -> HashSet + fn extract_sys_calls(file_path: &Path) -> HashSet where Self: Sized, { - //Argument _file_path prefixed with underscore to prevent complaints from cargo - //Rename to "file_path" when implementing - HashSet::new() + Self::extract_sys_calls(file_path) } } @@ -549,10 +734,9 @@ impl LibProcessor for PythonParser<'_> { T: IntoIterator, T::Item: AsRef, { - // fn process_files(&self, file_path: Vec<&str>) -> Vec<(LangInclude, Vec)>{ self.process_files(file_paths) .into_iter() - .map(|(python_include, vec)| (LangInclude::Python(python_include), vec)) + //.map(|(python_include, vec)| (python_include, vec)) .collect() } } @@ -638,4 +822,36 @@ mod tests { .collect(); assert_eq!(from_alias_imports, exp_from_alias_imports); } + + #[test] + fn test_extract_python_syscalls() { + let test_file = Path::new("tests/test_files/test_sys_calls.py"); + // Run the extractor + let calls = PythonParser::extract_sys_calls(test_file); + + // Collect only OS application names from the result set + let mut found: HashSet = HashSet::new(); + for call in calls { + if let LangInclude::OS(SystemProgram::Application(name)) = call { + found.insert(name); + } + } + + // Positive cases + assert!( + found.contains("ls"), + "Expected to find 'ls' from os.system(\"ls -l /tmp\")" + ); + assert!( + found.contains("echo"), + "Expected to find 'echo' from subprocess.run(\"echo hello\", shell=True)" + ); + + // Negative case: free-standing run(...) is not in the allowed (module.func) list + assert!( + !found.contains("rm"), + "Did not expect to match free-standing run(\"rm ...\")" + ); + } + } From f398d50cf2d7babcc34ba83b0be6f975fa43f5bb Mon Sep 17 00:00:00 2001 From: Micaela Date: Wed, 1 Oct 2025 16:04:43 -0700 Subject: [PATCH 2/7] fix: apply formatting --- src/parsing/python_parser.rs | 38 +++++++++++------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index c2527f8..43069f7 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -477,26 +477,21 @@ impl<'db> PythonParser<'db> { imports } - fn is_likely_syscall(module: &str, func: &str) -> bool - { + fn is_likely_syscall(module: &str, func: &str) -> bool { let combined = format!("{}.{}", module, func); - let predefined = ["os.system", "subprocess.run", "os.run"]; + let predefined = ["os.system", "subprocess.run", "os.run"]; - if predefined.contains(&combined.as_str()) - { + if predefined.contains(&combined.as_str()) { // println!("Matched: {}", combined); true - } - else - { + } else { { false } } } - pub fn extract_sys_calls(file_path: &Path) -> HashSet - { + pub fn extract_sys_calls(file_path: &Path) -> HashSet { let mut calls = HashSet::new(); // variable to hold the final grouping of calls let source_code = match fs::read_to_string(file_path) // read the file into a string { @@ -561,11 +556,9 @@ impl<'db> PythonParser<'db> { { // println!("func_name = {} ", f); // println!("module_name = {:?} ", module_name); - - if let Some(ref module) = module_name - { - if !Self::is_likely_syscall(module, &f) - { + + if let Some(ref module) = module_name { + if !Self::is_likely_syscall(module, &f) { continue; // not a system call we're interested in at the moment, so skip analysis } @@ -573,10 +566,8 @@ impl<'db> PythonParser<'db> { while let Some(node) = stack.pop() { if node.kind() == "string" { if let Ok(raw) = node.utf8_text(source_code.as_bytes()) { - let cleaned = raw - .trim_matches('"') - .trim_matches('\'') - .replace('\n', " "); + let cleaned = + raw.trim_matches('"').trim_matches('\'').replace('\n', " "); if let Some(cmd) = bash_parser::parse_bash_command(&cleaned) { calls.insert(LangInclude::OS(SystemProgram::Application(cmd))); } @@ -589,16 +580,12 @@ impl<'db> PythonParser<'db> { stack.push(child); } } - } } - } calls - } - fn process_files(&self, file_paths: T) -> HashMap>> where @@ -625,8 +612,6 @@ impl<'db> PythonParser<'db> { } }); - - //Prepare SQL for database query //TODO: Double check this, might want to normalize and change query to normalized_name let mut sql_statement = self @@ -822,7 +807,7 @@ mod tests { .collect(); assert_eq!(from_alias_imports, exp_from_alias_imports); } - + #[test] fn test_extract_python_syscalls() { let test_file = Path::new("tests/test_files/test_sys_calls.py"); @@ -853,5 +838,4 @@ mod tests { "Did not expect to match free-standing run(\"rm ...\")" ); } - } From 12110cd8927a3fe0f9fdc68f5c4fb663b2ea196e Mon Sep 17 00:00:00 2001 From: Micaela Date: Wed, 1 Oct 2025 16:24:58 -0700 Subject: [PATCH 3/7] feat: add test file for extract_sys_calls function --- tests/test_files/test_sys_calls.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/test_files/test_sys_calls.py diff --git a/tests/test_files/test_sys_calls.py b/tests/test_files/test_sys_calls.py new file mode 100644 index 0000000..58ed330 --- /dev/null +++ b/tests/test_files/test_sys_calls.py @@ -0,0 +1,38 @@ +import os +import subprocess +import subprocess as sp +from subprocess import run as sproot_run + +# Optional: make the file executable without AttributeError on os.run. +if not hasattr(os, "run"): + def _os_run(cmd, *args, **kwargs): + return os.system(cmd) + os.run = _os_run # type: ignore[attr-defined] + +def positives(): + # Should match: "os.system" + os.system("ls -l /tmp") + + # Should match: "os.system" + os.system("echo from os.system") + + # Should match: "subprocess.run" + subprocess.run(["echo", "from subprocess.run"]) + + # Should match: "os.run" + os.run("echo from os.run") + + # Depending on your resolver, these MAY or MAY NOT match + # (aliases/from-imports). Keep them if you want extra coverage. + sp.run("echo from sp.run", shell=True) # alias of subprocess + sproot_run(["echo", "from from-import"]) # from-import alias + +def negatives(): + # Should NOT match (not in the allowlist) + subprocess.check_call(["echo", "should NOT match"]) + os.path.join("a", "b") # not a syscall + run("rm -rf /tmp/somewhere") + +if __name__ == "__main__": + positives() + negatives() \ No newline at end of file From 6d8dcebaa885c5df1b53aebd33a40dcec9953d9d Mon Sep 17 00:00:00 2001 From: Micaela Date: Wed, 8 Oct 2025 17:12:29 -0700 Subject: [PATCH 4/7] fix: fix clippy errors --- src/parsing/cmake_parser.rs | 2 +- src/parsing/python_parser.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/parsing/cmake_parser.rs b/src/parsing/cmake_parser.rs index 1e92c0a..999589c 100644 --- a/src/parsing/cmake_parser.rs +++ b/src/parsing/cmake_parser.rs @@ -112,7 +112,7 @@ impl CMakeParser { arg_dict.insert(pair[0].clone(), pair[1].clone()); } if let Some(dangling) = groups.chunks_exact(2).remainder().first() { - eprintln!("Warning: Dangling key without value: {}", dangling); + eprintln!("Warning: Dangling key without value: {dangling}"); } if arg_dict.contains_key("URL") { diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index 43069f7..395f29c 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -478,7 +478,7 @@ impl<'db> PythonParser<'db> { } fn is_likely_syscall(module: &str, func: &str) -> bool { - let combined = format!("{}.{}", module, func); + let combined = format!("{module}.{func}"); let predefined = ["os.system", "subprocess.run", "os.run"]; if predefined.contains(&combined.as_str()) { @@ -526,7 +526,7 @@ impl<'db> PythonParser<'db> { for QueryCapture { node, index, .. } in m.captures // for loop to loop over the matches { - let capture_name = &PYTHON_SYS_CALL_QUERY.capture_names()[*index as usize][..]; // represents the current capture + let capture_name = PYTHON_SYS_CALL_QUERY.capture_names()[*index as usize]; // represents the current capture match capture_name // set the func_name and args_node variables to what was in the capture { "function_name" => From 8cb6a829126ec191955d9e0584b4c7f1ebbf6f7f Mon Sep 17 00:00:00 2001 From: Micaela Date: Mon, 20 Oct 2025 18:09:35 -0700 Subject: [PATCH 5/7] fix: Remove os.run from parser and test file since it doesn't exist --- src/parsing/python_parser.rs | 11 ++++++----- tests/test_files/test_sys_calls.py | 7 ------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index 395f29c..fc5273c 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -479,10 +479,9 @@ impl<'db> PythonParser<'db> { fn is_likely_syscall(module: &str, func: &str) -> bool { let combined = format!("{module}.{func}"); - let predefined = ["os.system", "subprocess.run", "os.run"]; + let predefined = ["os.system", "subprocess.run"]; if predefined.contains(&combined.as_str()) { - // println!("Matched: {}", combined); true } else { { @@ -674,9 +673,11 @@ impl<'db> PythonParser<'db> { }; if let Ok(libs) = query_db(&func_name) { - if !libs.is_empty() { - syscall_map.insert(call, vec![libs]); - } + // Uncomment below if you only want to return strings that appear in the database + // if !libs.is_empty() { + // syscall_map.insert(call, vec![libs]); + // } + syscall_map.insert(call, vec![libs]); } } diff --git a/tests/test_files/test_sys_calls.py b/tests/test_files/test_sys_calls.py index 58ed330..56d0c67 100644 --- a/tests/test_files/test_sys_calls.py +++ b/tests/test_files/test_sys_calls.py @@ -3,11 +3,6 @@ import subprocess as sp from subprocess import run as sproot_run -# Optional: make the file executable without AttributeError on os.run. -if not hasattr(os, "run"): - def _os_run(cmd, *args, **kwargs): - return os.system(cmd) - os.run = _os_run # type: ignore[attr-defined] def positives(): # Should match: "os.system" @@ -19,8 +14,6 @@ def positives(): # Should match: "subprocess.run" subprocess.run(["echo", "from subprocess.run"]) - # Should match: "os.run" - os.run("echo from os.run") # Depending on your resolver, these MAY or MAY NOT match # (aliases/from-imports). Keep them if you want extra coverage. From 95df09781ad09c8c411e113fc81fe8c7e3d254e8 Mon Sep 17 00:00:00 2001 From: Micaela Date: Mon, 20 Oct 2025 18:16:06 -0700 Subject: [PATCH 6/7] fix: cargo fmt error --- src/parsing/python_parser.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index fc5273c..589f271 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -676,7 +676,7 @@ impl<'db> PythonParser<'db> { // Uncomment below if you only want to return strings that appear in the database // if !libs.is_empty() { // syscall_map.insert(call, vec![libs]); - // } + // } syscall_map.insert(call, vec![libs]); } } From 598c4897e8f2030f8453013121b773027c88b5f4 Mon Sep 17 00:00:00 2001 From: Micaela Date: Mon, 20 Oct 2025 18:19:13 -0700 Subject: [PATCH 7/7] fix: cargo clippy error --- src/parsing/python_parser.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/parsing/python_parser.rs b/src/parsing/python_parser.rs index 589f271..9ab16aa 100644 --- a/src/parsing/python_parser.rs +++ b/src/parsing/python_parser.rs @@ -481,13 +481,7 @@ impl<'db> PythonParser<'db> { let combined = format!("{module}.{func}"); let predefined = ["os.system", "subprocess.run"]; - if predefined.contains(&combined.as_str()) { - true - } else { - { - false - } - } + predefined.contains(&combined.as_str()) } pub fn extract_sys_calls(file_path: &Path) -> HashSet {