Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
8ebc909
Enable recording of tolerance for models.
Xreki Jan 20, 2026
7f3c8d9
Update auto_fault_locator.py.
Xreki Jan 20, 2026
5ec032d
Fix prophetnet-large-uncased.
Xreki Jan 20, 2026
ec2f60d
Add original_name for several PaddleNLP samples.
Xreki Jan 20, 2026
746cdf1
Add original_name for several PaddleCV samples.
Xreki Jan 20, 2026
fb8404c
Update graph_hash.
Xreki Jan 20, 2026
961e06d
Update pre-commit.
Xreki Jan 20, 2026
c2e3ee1
Merge branch 'add_original_name_3' into add_tolerance_record
Xreki Jan 20, 2026
b245ff0
Minor optimization.
Xreki Jan 21, 2026
197247e
Fix a sample to remove inf and avoid big number.
Xreki Jan 21, 2026
ea8acf3
Merge branch 'develop' into add_tolerance_record
Xreki Jan 21, 2026
76211b8
Minor fix.
Xreki Jan 21, 2026
6e9229e
Remove bad sample from config list, paddle_samples/PaddleX/PP-ShiTuV2…
Xreki Jan 22, 2026
e3c5e45
Merge branch 'develop' into add_tolerance_record
Xreki Jan 22, 2026
22e635a
Optimize the prologue unittest generator to compute the allclose resu…
Xreki Jan 26, 2026
70c7cc1
Merge branch 'develop' into add_tolerance_record
Xreki Jan 27, 2026
dd8946f
Merge branch 'develop' into add_tolerance_record
Xreki Jan 28, 2026
2ff543b
Change log-file to model-path-list.
Xreki Jan 28, 2026
9a7d6d5
Merge branch 'develop' into add_tolerance_record
Xreki Jan 28, 2026
549a7f1
Merge branch 'develop' into add_tolerance_record
Xreki Jan 29, 2026
71453c4
Merge branch 'develop' into add_tolerance_record
Xreki Feb 28, 2026
8c0a5a9
Merge branch 'develop' into add_tolerance_record
Xreki Mar 9, 2026
4227ca9
Fix eval model and remove the randomness.
Xreki Mar 9, 2026
56f532b
Enable set and unset of use_gpudnn.
Xreki Mar 9, 2026
0726146
Fix sample_list.
Xreki Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 60 additions & 31 deletions graph_net/auto_fault_locator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

class AutoFaultLocator:
def __init__(self, args):
self.log_file = os.path.abspath(args.log_file)
self.model_path_list = os.path.abspath(args.model_path_list)
self.output_dir = os.path.abspath(args.output_dir)
self.framework = args.framework
self.decompose_method = args.decompose_method
Expand All @@ -23,20 +23,20 @@ def __init__(self, args):
self.machine = args.machine
self.port = args.port

def get_one_step_cmd(self, config_str):
config_b64 = convert_json_to_b64_string(config_str)
return [
def execute_one_step_cmd(self, test_config):
test_config_b64_str = convert_json_to_b64_string(test_config)
cmd = [
sys.executable,
"-m",
"graph_net.subgraph_decompose_and_evaluation_step",
"--log-file",
self.log_file,
"--model-path-list",
self.model_path_list,
"--output-dir",
self.output_dir,
"--framework",
self.framework,
"--test-config",
config_b64,
test_config_b64_str,
"--decompose-method",
self.decompose_method,
"--tolerance",
Expand All @@ -45,49 +45,78 @@ def get_one_step_cmd(self, config_str):
self.max_subgraph_size,
]

def run_remote_test_reference(self):
print(f"[AutoFaultLocator] Executing: {' '.join(cmd)}", flush=True)
result = subprocess.run(cmd, check=True, text=True)
return result

def run_test_reference_device(self, is_remote):
print(
"\n>>> [Step 1] Run Remote Reference Device (Decomposition And Evaluation)\n"
"\n>>> [AutoFaultLocator 2/1] Run Test Reference Device (Decomposition And Evaluation)\n",
flush=True,
)

test_remote_reference_device_config_str = {
"test_module_name": "test_remote_reference_device",
"test_remote_reference_device_arguments": {
test_module_name = (
"test_remote_reference_device" if is_remote else "test_reference_device"
)
test_reference_device_config = {
"test_module_name": test_module_name,
f"{test_module_name}_arguments": {
"model-path": None,
"reference-dir": None,
"compiler": "nope",
"device": self.reference_device,
"op-lib": "default",
"warmup": 5,
"trials": 20,
"seed": 123,
"machine": self.machine,
"port": self.port,
},
}

cmd = self.get_one_step_cmd(test_remote_reference_device_config_str)
print(f"Executing: {' '.join(cmd)}")
result = subprocess.run(cmd, check=True, text=True)
if args.framework == "torch":
test_reference_device_config[f"{test_module_name}_arguments"].update(
{"op-lib": "default"}
)
if is_remote:
test_reference_device_config[f"{test_module_name}_arguments"].update(
{
"machine": self.machine,
"port": self.port,
}
)

result = self.execute_one_step_cmd(test_reference_device_config)
assert (
result.returncode == 0
), f"Run Remote Reference Device failed with return code {result.returncode}"

def run_local_test_target(self):
print("\n>>> [Step 2] Run Local Target Device (Evaluation And Analysis)\n")
def run_test_target_device(self, is_remote):
print(
"\n>>> [AutoFaultLocator 2/2] Run Test Target Device (Evaluation And Analysis)\n",
flush=True,
)

test_target_device_config_str = {
"test_module_name": "test_target_device",
"test_target_device_arguments": {
test_module_name = (
"test_remote_target_device" if is_remote else "test_target_device"
)
test_target_device_config = {
"test_module_name": test_module_name,
f"{test_module_name}_arguments": {
"model-path": None,
"reference-dir": None,
"compiler": "nope",
"device": self.target_device,
"warmup": 5,
"trials": 20,
"seed": 123,
},
}

cmd = self.get_one_step_cmd(test_target_device_config_str)
print(f"Executing: {' '.join(cmd)}")
result = subprocess.run(cmd, check=True, text=True)
if is_remote:
test_target_device_config[f"{test_module_name}_arguments"].update(
{
"machine": self.machine,
"port": self.port,
}
)

result = self.execute_one_step_cmd(test_target_device_config)
assert (
result.returncode == 0
), f"Run Local Target Device failed with return code {result.returncode}"
Expand All @@ -114,16 +143,16 @@ def analyze_and_decide_next(self):
def main(args):
    """Iteratively run reference/target evaluation steps until the locator
    decides the fault has been isolated."""
    locator = AutoFaultLocator(args)
    while True:
        # NOTE(review): the reference step runs locally (is_remote=False) while
        # the target step runs remotely (is_remote=True). The pre-refactor code
        # did the opposite (remote reference + local target) — confirm these
        # flags are intended and not accidentally swapped.
        locator.run_test_reference_device(is_remote=False)
        locator.run_test_target_device(is_remote=True)
        should_continue = locator.analyze_and_decide_next()
        if not should_continue:
            break


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--log-file", type=str, required=True)
parser.add_argument("--model-path-list", type=str, required=True)
parser.add_argument("--output-dir", type=str, required=True)
parser.add_argument(
"--framework", type=str, choices=["paddle", "torch"], required=True
Expand Down
1 change: 0 additions & 1 deletion graph_net/config/paddle_samples_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,6 @@ paddle_samples/PaddleX/ResNet101_vd
paddle_samples/PaddleNLP/roformer_v2_chinese_char_small
paddle_samples/PaddleX/Mask-RT-DETR-X/subgraph_0
paddle_samples/PaddleX/Mask-RT-DETR-X/subgraph_1
paddle_samples/PaddleX/PP-ShiTuV2_det/subgraph_1
paddle_samples/PaddleX/MobileNetV4_conv_medium/subgraph_0
paddle_samples/PaddleX/PicoDet-S/subgraph_9
paddle_samples/PaddleX/PP-HGNetV2-B5/subgraph_0
Expand Down
42 changes: 31 additions & 11 deletions graph_net/paddle/prologue_subgraph_unittest_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,22 +270,42 @@ def _get_output_shapes(outs):
)
self.assertTrue(shape_match, f"Shape of outputs are not matched ({reference_shapes=} vs {target_shapes=}).")

def check_results(self, reference_outputs, target_outputs):
def _convert_to_numpy(out):
if out.dtype not in [paddle.float32, paddle.float64]:
return out.cast("float32").numpy()
else:
return out.numpy()
def convert_to_numpy(self, out):
if out.dtype not in [paddle.float32, paddle.float64]:
return out.cast("float32").numpy()
else:
return out.numpy()

def get_all_cmp_results(self, reference_outputs, target_outputs, name):
def _get_cmp_allclose(cmp_arrays, tolerance):
return [
int(np.allclose(actual, desired, atol=atol, rtol=rtol))
for dtype, actual, desired in cmp_arrays
for (atol, rtol) in [tolerance_generator(tolerance, dtype)]
]

cmp_arrays = [
(reference.dtype, self.convert_to_numpy(reference), self.convert_to_numpy(target))
for reference, target in zip(reference_outputs, target_outputs)]
for tolerance in range(-10, 2):
cmp_results = _get_cmp_allclose(cmp_arrays, tolerance)
is_correct = all(x == 1 for x in cmp_results)
cmp_results_str = " ".join(str(v) for v in cmp_results)
print(f"{name}, tolerance: {tolerance:3d}, allclose: {is_correct}, cmp_result: {cmp_results_str}")
print()

def check_results(self, reference_outputs, target_outputs, name):
assert len(reference_outputs) == len(target_outputs), f"The number of outputs is not equal ({len(reference_outputs)=} vs {len(target_outputs)})."
self.check_dtypes(reference_outputs, target_outputs)
self.check_shapes(reference_outputs, target_outputs)

self.get_all_cmp_results(reference_outputs, target_outputs, name)

for reference, target in zip(reference_outputs, target_outputs):
atol, rtol = tolerance_generator(self.tolerance, reference.dtype)
np.testing.assert_allclose(
actual=_convert_to_numpy(target),
desired=_convert_to_numpy(reference),
actual=self.convert_to_numpy(target),
desired=self.convert_to_numpy(reference),
atol=atol,
rtol=rtol,
)
Expand All @@ -302,7 +322,7 @@ def test_separated(self):
print(f"Load prologue output tensors from {prologue_output_path}")
prologue_reference_outputs = paddle.load(prologue_output_path)
with self.subTest(name="check_prologue_outputs"):
self.check_results(prologue_reference_outputs, prologue_outputs)
self.check_results(prologue_reference_outputs, prologue_outputs, name="check_prologue_outputs")

test_output_path = os.path.join(self.reference_dir, "{{graph_module_desc.model_name}}_separated_reference.pdout")
test_outputs = self.run_suspect_layer(prologue_reference_outputs)
Expand All @@ -313,7 +333,7 @@ def test_separated(self):
print(f"Load test output tensors on reference device from {test_output_path}.")
test_reference_outputs = paddle.load(test_output_path)
with self.subTest(name="check_suspect_outputs"):
self.check_results(test_reference_outputs, test_outputs)
self.check_results(test_reference_outputs, test_outputs, name="check_suspect_outputs")

def test_combined(self):
paddle.seed(self.runtime_seed)
Expand All @@ -326,7 +346,7 @@ def test_combined(self):
print(f"Load test output tensors on reference device from {test_output_path}.")
test_reference_outputs = paddle.load(test_output_path)
with self.subTest(name="check_combined_outputs"):
self.check_results(test_reference_outputs, test_outputs)
self.check_results(test_reference_outputs, test_outputs, name="check_combined_outputs")


if __name__ == "__main__":
Expand Down
3 changes: 1 addition & 2 deletions graph_net/paddle/test_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@ def load_class_from_file(file_path: str, class_name: str):

with open(file_path, "r", encoding="utf-8") as f:
original_code = f.read()
import_stmt = "import paddle"
modified_code = f"{import_stmt}\n{original_code}"
modified_code = utils.rewrite_model(original_code)
spec = importlib.util.spec_from_loader(module_name, loader=None)
module = importlib.util.module_from_spec(spec)
sys.modules[module_name] = module
Expand Down
Loading