3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -9,7 +9,8 @@ repos:
     rev: v0.14.4
     hooks:
       - id: ruff-check
-        args: [--fix, --exit-non-zero-on-fix, --no-cache, --exclude=samples]
+        args: [--fix, --exit-non-zero-on-fix, --no-cache]
+        exclude: (^|/)(samples|paddle_samples)/
 
   - repo: https://github.com/Lucas-C/pre-commit-hooks.git
     rev: v1.5.1
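The exclusion moves from ruff's own --exclude flag to pre-commit's exclude key. This matters because pre-commit passes filenames to hooks explicitly, and ruff only honors exclusions for explicitly passed files when --force-exclude is set; filtering at the pre-commit level sidesteps that. A quick standalone check of the new regex (an illustration, not part of the repo):

import re

# pre-commit matches `exclude` against each file's repo-relative path.
pattern = re.compile(r"(^|/)(samples|paddle_samples)/")

assert pattern.search("paddle_samples/PaddleNLP/bert-base-cased/input_meta.py")
assert pattern.search("nested/samples/x.py")
assert not pattern.search("graph_net/test/paddle_nlp_test.py")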
40 changes: 35 additions & 5 deletions graph_net/test/paddle_nlp_model_getter.py
@@ -1,3 +1,6 @@
+# Reference implementation: https://github.com/PaddlePaddle/PaddleTest/tree/develop/framework/e2e/PaddleLT_new/layerNLPcase/transformers
+
+
 def get_auto_model_and_inputs(model_name, text, dtype):
     from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
@@ -139,18 +142,18 @@ def get_xlnet_model_and_inputs(model_name, text, dtype):
 
     tokenizer = XLNetTokenizer.from_pretrained(model_name)
 
-    enc = tokenizer(
+    inputs = tokenizer(
         text,
         return_tensors="pd",
         padding=True,
         truncation=True,
         # max_length=512,
     )
-    if "attention_mask" not in enc:
-        input_ids = enc["input_ids"]
+    if "attention_mask" not in inputs:
+        input_ids = inputs["input_ids"]
         pad_id = tokenizer.pad_token_id
-        enc["attention_mask"] = (input_ids != pad_id).astype("int64")
-    return model, enc
+        inputs["attention_mask"] = (input_ids != pad_id).astype("int64")
+    return model, inputs
 
 
 def get_fnet_model_and_inputs(model_name, text, dtype):
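Besides the rename from enc to inputs, this hunk synthesizes a fallback attention mask when the tokenizer omits one. A standalone sketch of that mask construction with toy ids (using 0 as the pad id is an assumption for illustration):

import paddle

input_ids = paddle.to_tensor([[101, 7592, 2088, 0, 0]])  # toy ids; 0 plays the pad token here
pad_id = 0
attention_mask = (input_ids != pad_id).astype("int64")
print(attention_mask.numpy())  # [[1 1 1 0 0]]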
@@ -163,3 +166,30 @@ def get_fnet_model_and_inputs(model_name, text, dtype):
     tokenizer = FNetTokenizer.from_pretrained(model_name)
     inputs = tokenizer(text, return_tensors="pd")
     return model, inputs
+
+
+def get_prophetnet_model_and_inputs(model_name, text, dtype):
+    import paddle
+    from paddlenlp.transformers import ProphetNetModel, ProphetNetConfig
+    from paddlenlp.transformers import ProphetNetTokenizer
+
+    config = ProphetNetConfig.from_pretrained(model_name)
+    model = ProphetNetModel(config)
+
+    tokenizer = ProphetNetTokenizer.from_pretrained(model_name)
+    inputs = tokenizer(text, return_tensors="pd")
+    inputs.pop("token_type_ids", None)
+
+    if "attention_mask" not in inputs:
+        input_ids = inputs["input_ids"]
+        pad_id = tokenizer.pad_token_id
+        inputs["attention_mask"] = (input_ids != pad_id).astype("int64")
+
+    if "decoder_input_ids" not in inputs:
+        batch_size = inputs["input_ids"].shape[0]
+        decoder_input_ids = paddle.full(
+            [batch_size, 1], tokenizer.bos_token_id, dtype="int64"
+        )
+        inputs["decoder_input_ids"] = decoder_input_ids
+
+    return model, inputs
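A usage sketch for the new getter. Note that ProphetNetModel(config) builds randomly initialized weights, so only the config and tokenizer are fetched for model_name; the forward call below is an assumption about how process_model drives the returned pair:

import paddle
from graph_net.test import paddle_nlp_model_getter

model, inputs = paddle_nlp_model_getter.get_prophetnet_model_and_inputs(
    "prophetnet-large-uncased", "Hello world.", dtype="float32"
)
model.eval()
with paddle.no_grad():
    outputs = model(**inputs)  # uses the synthesized attention_mask and decoder_input_ids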
6 changes: 6 additions & 0 deletions graph_net/test/paddle_nlp_test.py
@@ -275,6 +275,12 @@ def extract_fnet_models(text_en, text_cn):
     process_model(model_name, nlp_model_getter.get_fnet_model_and_inputs, text_en)
 
 
+def extract_prophetnet_models(text_en, text_cn):
+    # prophetnet-large-uncased models
+    model_name = "prophetnet-large-uncased"
+    process_model(model_name, nlp_model_getter.get_prophetnet_model_and_inputs, text_en)
+
+
 def main():
     current_dir = os.path.dirname(os.path.abspath(__file__))
     dump_dir = os.path.join(current_dir, "dump")
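A hedged sketch of how the new extractor presumably joins the existing ones inside main(); the real call sites and sample texts sit outside the visible hunk:

# Hypothetical continuation of main(); the texts are placeholders, not the repo's.
text_en = "..."  # English sample text defined elsewhere in this file
text_cn = "..."  # Chinese sample text defined elsewhere in this file
extract_fnet_models(text_en, text_cn)
extract_prophetnet_models(text_en, text_cn)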
2 changes: 1 addition & 1 deletion paddle_samples/PaddleNLP/bert-base-cased/graph_hash.txt
@@ -1 +1 @@
-517608d4d2699e09c6171648da38a4f924556cf25abd97875599acfdda5807e4
+a354c7a9af04a38be394ae238e9a62e2595ecb8743b22059be566ce6ae3d04e3
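The stored value is 64 hex characters, consistent with a SHA-256 digest, presumably recomputed because the captured graph's input metadata changed in this PR. A minimal sketch of such a digest (what graph_net actually feeds the hash is an assumption):

import hashlib

# Assumption: graph_hash.txt stores a SHA-256 digest over some serialized
# form of the captured graph; the exact input is not shown in this diff.
def graph_hash(serialized_graph: bytes) -> str:
    return hashlib.sha256(serialized_graph).hexdigest()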
2 changes: 2 additions & 0 deletions paddle_samples/PaddleNLP/bert-base-cased/input_meta.py
@@ -1,5 +1,6 @@
 class Program_weight_tensor_data_0:
     name = "data_0"
+    original_name = "input_ids"
     shape = [1, 21]
     dtype = "int64"
     data = [
@@ -29,6 +30,7 @@ class Program_weight_tensor_data_0:
 
 class Program_weight_tensor_data_1:
     name = "data_1"
+    original_name = "token_type_ids"
     shape = [1, 21]
     dtype = "int64"
     data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
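The new original_name field ties each dumped tensor back to the tokenizer key it came from. A hedged sketch of consuming one of these metadata classes (graph_net's real loader is not shown in this diff):

import paddle

def rebuild_input(meta_cls):
    # Rebuild a tensor from the recorded shape/dtype/data and return it
    # under its original tokenizer key.
    tensor = paddle.to_tensor(meta_cls.data, dtype=meta_cls.dtype).reshape(meta_cls.shape)
    return meta_cls.original_name, tensor

name, ids = rebuild_input(Program_weight_tensor_data_0)  # ("input_ids", Tensor of shape [1, 21])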