diff --git a/example/extract/extract_pdf.ipynb b/example/extract/extract_pdf.ipynb index 05387ffe..459197d6 100644 --- a/example/extract/extract_pdf.ipynb +++ b/example/extract/extract_pdf.ipynb @@ -220,27 +220,35 @@ "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last \\\n", + "example. Follow the format of the examples below to include context, question, and answer in the response\"\"\"\n", + "\n", + "context=\"\"\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\"\"\"\n", + "question=\"\"\"Who published A Mathematical Theory of Communication in 1948?\"\"\"\n", + "answer=\"\"\"Claude E. Shannon.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last\n", - " example. Follow the format of the examples below to include context, question, and answer in the response\"\"\",\n", + " instruction=instruction,\n", " few_shot_prompt=[Context(\n", - " context=\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\"\"\",\n", - " question=\"Who published A Mathematical Theory of Communication in 1948?\"\"\",\n", - " answer=\"Claude E. 
Shannon.\"\"\"\n", + " context=context,\n", + " question=question,\n", + " answer=answer\n", " )]\n", ")\n", + "\n", "input_data = [\n", - " Context(\n", - " context=p[:1000],\n", - " question=\"\",\n", - " answer=\"\",\n", - " )\n", - " for p in contexts\n", + " Context(\n", + " context=p[:1000],\n", + " question=\"\"\"\"\"\",\n", + " answer=\"\"\"\"\"\",\n", + " )\n", + " for p in contexts\n", "]\n" ] }, { "cell_type": "markdown", + "id": "8f7be007", "metadata": {}, "source": [ "### Run the model\n", @@ -540,6 +548,12 @@ " \n", "" ] + }, + { + "cell_type": "markdown", + "id": "9d0ada9b", + "metadata": {}, + "source": [] } ], "metadata": { diff --git a/example/llm/sagemaker_deploy_mistral.ipynb b/example/llm/sagemaker_deploy_mistral.ipynb index 1d7f8f59..13f609ee 100644 --- a/example/llm/sagemaker_deploy_mistral.ipynb +++ b/example/llm/sagemaker_deploy_mistral.ipynb @@ -703,7 +703,7 @@ "\n", "\n", " \n", - "\n" + "" ] } ], diff --git a/example/pipeline/pipeline_pdf.ipynb b/example/pipeline/pipeline_pdf.ipynb index c7968f3c..54db17b3 100644 --- a/example/pipeline/pipeline_pdf.ipynb +++ b/example/pipeline/pipeline_pdf.ipynb @@ -139,13 +139,19 @@ "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last \\\n", + "example. Follow the format of the examples below to include context, question, and answer in the response\"\"\"\n", + "\n", + "context=\"\"\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\"\"\"\n", + "question=\"\"\"Who published A Mathematical Theory of Communication in 1948?\"\"\"\n", + "answer=\"\"\"Claude E. 
Shannon.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last\n", - " example. Follow the format of the examples below to include context, question, and answer in the response\"\"\",\n", + " instruction=instruction,\n", " few_shot_prompt=[Context(\n", - " context=\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\"\"\",\n", - " question=\"Who published A Mathematical Theory of Communication in 1948?\"\"\",\n", - " answer=\"Claude E. Shannon.\"\"\"\n", + " context=context,\n", + " question=question,\n", + " answer=answer\n", " )]\n", ")\n" ] diff --git a/example/pipeline/pipeline_pdf_extract_transform.ipynb b/example/pipeline/pipeline_pdf_extract_transform.ipynb index 86997598..8be92963 100644 --- a/example/pipeline/pipeline_pdf_extract_transform.ipynb +++ b/example/pipeline/pipeline_pdf_extract_transform.ipynb @@ -149,14 +149,20 @@ "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last \\\n", + "example. Follow the format of the examples below to include context, question, and answer in the response\"\"\"\n", + "\n", + "context=\"\"\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\"\"\"\n", + "question=\"\"\"Who published A Mathematical Theory of Communication in 1948?\"\"\"\n", + "answer=\"\"\"Claude E. 
Shannon.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last\n", - " example. Follow the format of the examples below to include context, question, and answer in the response\"\"\",\n", + " instruction=instruction,\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\",\n", - " question=\"Who published A Mathematical Theory of Communication in 1948?\",\n", - " answer=\"Claude E. Shannon.\",\n", + " context=context,\n", + " question=question,\n", + " answer=answer,\n", " ),\n", " \n", "])" diff --git a/example/rater/bedrock_classification.ipynb b/example/rater/bedrock_classification.ipynb index 644fd47f..27e473ca 100644 --- a/example/rater/bedrock_classification.ipynb +++ b/example/rater/bedrock_classification.ipynb @@ -132,15 +132,15 @@ "outputs": [], "source": [ "raw_input = [\n", - " (\"The Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south.\",\n", - " \"What is the largest ocean on Earth?\",\n", - " \"The largest ocean on Earth is the Pacific Ocean.\"), # correct\n", - " (\"Shakespeare, a renowned English playwright and poet, wrote 39 plays during his lifetime. His works include famous plays like 'Hamlet' and 'Romeo and Juliet'.\",\n", - " \"How many plays did Shakespeare write?\",\n", - " \"Shakespeare wrote 39 plays.\"), # correct\n", - " (\"The human brain is an intricate organ responsible for intelligence, memory, and emotions. 
It is made up of approximately 86 billion neurons.\",\n", - " \"What is the human brain responsible for?\",\n", - " \"The human brain is responsible for physical movement.\"), # incorrect\n", + " (\"\"\"The Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south.\"\"\",\n", + " \"\"\"What is the largest ocean on Earth?\"\"\",\n", + " \"\"\"The largest ocean on Earth is the Pacific Ocean.\"\"\"), # correct\n", + " (\"\"\"Shakespeare, a renowned English playwright and poet, wrote 39 plays during his lifetime. His works include famous plays like 'Hamlet' and 'Romeo and Juliet'.\"\"\",\n", + " \"\"\"How many plays did Shakespeare write?\"\"\",\n", + " \"\"\"Shakespeare wrote 39 plays.\"\"\"), # correct\n", + " (\"\"\"The human brain is an intricate organ responsible for intelligence, memory, and emotions. It is made up of approximately 86 billion neurons.\"\"\",\n", + " \"\"\"What is the human brain responsible for?\"\"\",\n", + " \"\"\"The human brain is responsible for physical movement.\"\"\"), # incorrect\n", "]" ] }, diff --git a/example/rater/classification.ipynb b/example/rater/classification.ipynb index bb61eab6..db5430e8 100644 --- a/example/rater/classification.ipynb +++ b/example/rater/classification.ipynb @@ -89,15 +89,15 @@ "outputs": [], "source": [ "raw_input = [\n", - " (\"The Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south.\",\n", - " \"What is the largest ocean on Earth?\",\n", - " \"The largest ocean on Earth is the Pacific Ocean.\"), # correct\n", - " (\"Shakespeare, a renowned English playwright and poet, wrote 39 plays during his lifetime. 
His works include famous plays like 'Hamlet' and 'Romeo and Juliet'.\",\n", - " \"How many plays did Shakespeare write?\",\n", - " \"Shakespeare wrote 31 plays.\"), # incorrect\n", - " (\"The human brain is an intricate organ responsible for intelligence, memory, and emotions. It is made up of approximately 86 billion neurons.\",\n", - " \"What is the human brain responsible for?\",\n", - " \"The human brain is responsible for physical movement.\"), # incorrect\n", + " (\"\"\"The Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south.\"\"\",\n", + " \"\"\"What is the largest ocean on Earth?\"\"\",\n", + " \"\"\"The largest ocean on Earth is the Pacific Ocean.\"\"\"), # correct\n", + " (\"\"\"Shakespeare, a renowned English playwright and poet, wrote 39 plays during his lifetime. His works include famous plays like 'Hamlet' and 'Romeo and Juliet'.\"\"\",\n", + " \"\"\"How many plays did Shakespeare write?\"\"\",\n", + " \"\"\"Shakespeare wrote 31 plays.\"\"\"), # incorrect\n", + " (\"\"\"The human brain is an intricate organ responsible for intelligence, memory, and emotions. It is made up of approximately 86 billion neurons.\"\"\",\n", + " \"\"\"What is the human brain responsible for?\"\"\",\n", + " \"\"\"The human brain is responsible for physical movement.\"\"\"), # incorrect\n", "]\n", "\n", "data = [\n", diff --git a/example/rater/generated_answer.ipynb b/example/rater/generated_answer.ipynb index 114a3664..5b2429da 100644 --- a/example/rater/generated_answer.ipynb +++ b/example/rater/generated_answer.ipynb @@ -90,18 +90,18 @@ "outputs": [], "source": [ "raw_input = [\n", - " (\"Reddit is an American social news aggregation, content rating, and discussion website. 
Registered users submit content to the site such as links, text posts, images, and videos, which are then voted up or down by other members.\",\n", - " \"What type of content can users submit on Reddit?\",\n", - " \"Users can only post text on Reddit.\",\n", - " \"Users on Reddit can submit various types of content including links, text posts, images, and videos.\"), # Better\n", - " (\"League of Legends (LoL), commonly referred to as League, is a 2009 multiplayer online battle arena video game developed and published by Riot Games. \",\n", - " \"When was League of Legends released?\",\n", - " \"League of Legends was released in 2009.\",\n", - " \"League of Legends was released in the early 2000s.\"), # Worse\n", - " (\"Vitamin C (also known as ascorbic acid and ascorbate) is a water-soluble vitamin found in citrus and other fruits, berries and vegetables, also sold as a dietary supplement and as a topical serum ingredient to treat melasma (dark pigment spots) and wrinkles on the face.\",\n", - " \"Is Vitamin C water-soluble?\",\n", - " \"Yes, Vitamin C is a very water-soluble vitamin.\",\n", - " \"Yes, Vitamin C can be dissolved in water well.\"), # Equally good\n", + " (\"\"\"Reddit is an American social news aggregation, content rating, and discussion website. Registered users submit content to the site such as links, text posts, images, and videos, which are then voted up or down by other members.\"\"\",\n", + " \"\"\"What type of content can users submit on Reddit?\"\"\",\n", + " \"\"\"Users can only post text on Reddit.\"\"\",\n", + " \"\"\"Users on Reddit can submit various types of content including links, text posts, images, and videos.\"\"\"), # Better\n", + " (\"\"\"League of Legends (LoL), commonly referred to as League, is a 2009 multiplayer online battle arena video game developed and published by Riot Games. 
\"\"\",\n", + " \"\"\"When was League of Legends released?\"\"\",\n", + " \"\"\"League of Legends was released in 2009.\"\"\",\n", + " \"\"\"League of Legends was released in the early 2000s.\"\"\"), # Worse\n", + " (\"\"\"Vitamin C (also known as ascorbic acid and ascorbate) is a water-soluble vitamin found in citrus and other fruits, berries and vegetables, also sold as a dietary supplement and as a topical serum ingredient to treat melasma (dark pigment spots) and wrinkles on the face.\"\"\",\n", + " \"\"\"Is Vitamin C water-soluble?\"\"\",\n", + " \"\"\"Yes, Vitamin C is a very water-soluble vitamin.\"\"\",\n", + " \"\"\"Yes, Vitamin C can be dissolved in water well.\"\"\"), # Equally good\n", "]\n", "\n", "data = [\n", diff --git a/example/rater/huggingface_classification.ipynb b/example/rater/huggingface_classification.ipynb index f1f851ce..5af8483a 100644 --- a/example/rater/huggingface_classification.ipynb +++ b/example/rater/huggingface_classification.ipynb @@ -165,7 +165,7 @@ "source": [ "config = RaterForClassificationHuggingfaceConfig(\n", " model_config=HuggingfaceModelConfig(\n", - " response_start_key=\"explanation\", \n", + " response_start_key=\"explanation\",\n", " response_format={\"type\": \"json_object\"},\n", " batch_size=2\n", " )\n", @@ -417,7 +417,7 @@ "\n", "config2 = RaterForClassificationHuggingfaceConfig(\n", " model_config=HuggingfaceModelConfig(\n", - " response_start_key=\"explanation\", \n", + " response_start_key=\"explanation\",\n", " response_format={\"type\": \"text\"},\n", " batch_size=3,\n", " do_sample=True,\n", diff --git a/example/transform/huggingface_model.ipynb b/example/transform/huggingface_model.ipynb index 8cda6aa0..409cef23 100644 --- a/example/transform/huggingface_model.ipynb +++ b/example/transform/huggingface_model.ipynb @@ -156,14 +156,14 @@ "\n", "sample_examples = [\n", " Context(\n", - " context=\"The quick brown fox jumps over the lazy dog.\",\n", - " question=\"What is the color of the fox?\",\n", - " 
answer=\"brown.\"\n", + " context=\"\"\"The quick brown fox jumps over the lazy dog.\"\"\",\n", + " question=\"\"\"What is the color of the fox?\"\"\",\n", + " answer=\"\"\"brown.\"\"\"\n", " ),\n", " Context(\n", - " context=\"The quick brown fox jumps over the lazy black dog.\",\n", - " question=\"What is the color of the dog?\",\n", - " answer=\"black.\"\n", + " context=\"\"\"The quick brown fox jumps over the lazy black dog.\"\"\",\n", + " question=\"\"\"What is the color of the dog?\"\"\",\n", + " answer=\"\"\"black.\"\"\"\n", " )]\n", "\n", "guided_prompt = PromptTemplate(\n", diff --git a/example/transform/huggingface_model_5QAs.ipynb b/example/transform/huggingface_model_5QAs.ipynb index 924d7de1..8d3ce88c 100644 --- a/example/transform/huggingface_model_5QAs.ipynb +++ b/example/transform/huggingface_model_5QAs.ipynb @@ -376,7 +376,6 @@ } ], "source": [ - "\n", "input_data = [\n", " Context(context=data)\n", " for data in raw_context_input_400\n", diff --git a/example/transform/huggingface_model_neuron.ipynb b/example/transform/huggingface_model_neuron.ipynb index 0c6af476..bcb408ec 100644 --- a/example/transform/huggingface_model_neuron.ipynb +++ b/example/transform/huggingface_model_neuron.ipynb @@ -149,14 +149,14 @@ "\n", "sample_examples = [\n", " Context(\n", - " context=\"The quick brown fox jumps over the lazy dog.\",\n", - " question=\"What is the color of the fox?\",\n", - " answer=\"brown.\"\n", + " context=\"\"\"The quick brown fox jumps over the lazy dog.\"\"\",\n", + " question=\"\"\"What is the color of the fox?\"\"\",\n", + " answer=\"\"\"brown.\"\"\"\n", " ),\n", " Context(\n", - " context=\"The quick brown fox jumps over the lazy black dog.\",\n", - " question=\"What is the color of the dog?\",\n", - " answer=\"black.\"\n", + " context=\"\"\"The quick brown fox jumps over the lazy black dog.\"\"\",\n", + " question=\"\"\"What is the color of the dog?\"\"\",\n", + " answer=\"\"\"black.\"\"\"\n", " )]\n", "\n", "guided_prompt = 
PromptTemplate(\n", @@ -243,7 +243,6 @@ } ], "source": [ - "\n", "input_data = [\n", " Context(context=data)\n", " for data in raw_context_input_400\n", diff --git a/example/transform/lmqg_model.ipynb b/example/transform/lmqg_model.ipynb index 3c6335ae..9d55640c 100644 --- a/example/transform/lmqg_model.ipynb +++ b/example/transform/lmqg_model.ipynb @@ -271,10 +271,10 @@ "outputs": [], "source": [ "raw_context_input = [\n", - " \"\"\"William Turner was an English painter who specialised in watercolour landscapes. He is often known\n", - " \"as William Turner of Oxford or just Turner of Oxford to distinguish him from his contemporary,\n", - " \"J. M. W. Turner. Many of Turner's paintings depicted the countryside around Oxford. One of his\n", - " \"best known pictures is a view of the city of Oxford from Hinksey Hill.\"\"\",\n", + " \"\"\"William Turner was an English painter who specialised in watercolour landscapes. He is often known \\\n", + "as William Turner of Oxford or just Turner of Oxford to distinguish him from his contemporary, \\\n", + "J. M. W. Turner. Many of Turner's paintings depicted the countryside around Oxford. 
One of his \\\n", + "best known pictures is a view of the city of Oxford from Hinksey Hill.\"\"\",\n", " \"\"\"My name is bobby and I am a talent software engineer working on AI/ML.\"\"\"\n", "]" ] diff --git a/example/transform/model.ipynb b/example/transform/model.ipynb index 024bacde..ca24c052 100644 --- a/example/transform/model.ipynb +++ b/example/transform/model.ipynb @@ -129,8 +129,8 @@ "outputs": [], "source": [ "raw_context_input = [\n", - " \"It was a sunny day and the sky color is blue.\",\n", - " \"My name is Bobby and I am a talent software engineer working on AI/ML\",\n", + " \"\"\"It was a sunny day and the sky color is blue.\"\"\",\n", + " \"\"\"My name is Bobby and I am a talent software engineer working on AI/ML\"\"\",\n", "]" ] }, diff --git a/example/transform/openai_json_model.ipynb b/example/transform/openai_json_model.ipynb index 7649b803..d2cf83b6 100644 --- a/example/transform/openai_json_model.ipynb +++ b/example/transform/openai_json_model.ipynb @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -122,14 +122,14 @@ " instruction=\"Generate one question and its corresponding answer based on the context. 
Follow the format of the examples below to include context, question, and answer in the response in json\",\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"The quick brown fox jumps over the lazy black dog.\",\n", - " question=\"What is the color of the fox?\",\n", - " answer=\"brown.\",\n", + " context=\"\"\"The quick brown fox jumps over the lazy black dog.\"\"\",\n", + " question=\"\"\"What is the color of the fox?\"\"\",\n", + " answer=\"\"\"brown.\"\"\"\n", " ),\n", " Context(\n", - " context=\"The quick brown fox jumps over the lazy black dog.\",\n", - " question=\"What is the color of the dog?\",\n", - " answer=\"black.\",\n", + " context=\"\"\"The quick brown fox jumps over the lazy black dog.\"\"\",\n", + " question=\"\"\"What is the color of the dog?\"\"\",\n", + " answer=\"\"\"black.\"\"\"\n", " ),\n", " ],\n", ")" @@ -149,8 +149,8 @@ "outputs": [], "source": [ "raw_context_input = [\n", - " \"It was a sunny day and the sky color is blue.\",\n", - " \"My name is bobby and I am a talent software engineer working on AI/ML.\",\n", + " \"\"\"It was a sunny day and the sky color is blue.\"\"\",\n", + " \"\"\"My name is bobby and I am a talent software engineer working on AI/ML.\"\"\"\n", "]" ] }, diff --git a/example/transform/openai_jupyter_notebook_QA.ipynb b/example/transform/openai_jupyter_notebook_QA.ipynb index 9b86224f..f1d4aad5 100644 --- a/example/transform/openai_jupyter_notebook_QA.ipynb +++ b/example/transform/openai_jupyter_notebook_QA.ipynb @@ -301,22 +301,29 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "guided_prompt = PromptTemplate(\n", - " instruction=\"If there is a code cell, generate one question given the markdown cell and its corresponding \\\n", + "instruction=\"\"\"If there is a code cell, generate one question given the markdown cell and its corresponding \\\n", "answer based on code cell and its output. 
If there is no code cell, generate one question and its corresponding \\\n", "answer based on context. Following the format of the examples below to include the same context, question, and \\\n", - "answer in the response.\",\n", + "answer in the response.\"\"\"\n", + "\n", + "context=\"\"\"'markdown' cell: '['### Use LLM to generate data in Uniflow. \\\n", + "In this example, we use the base `Config` defaults with the [OpenAIModelConfig] to generate questions and answers.']' \\\n", + "'code' cell: '['config = Config(model_config=OpenAIModelConfig())', 'client = Client(config)']'\"\"\"\n", + "question=\"\"\"How to use LLM to generate data in Uniflow\"\"\"\n", + "answer=\"\"\"We can use the Uniflow's default [OpenAIModelConfig] to generate questions and answers with \\\n", + "code: '['config = Config(model_config=OpenAIModelConfig())', 'client = Client(config)']'\"\"\"\n", + "\n", + "guided_prompt = PromptTemplate(\n", + " instruction=instruction,\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"\"\"'markdown' cell: '['### Use LLM to generate data in Uniflow.\n", - " In this example, we use the base `Config` defaults with the [OpenAIModelConfig] to generate questions and answers.']'\n", - " 'code' cell: '['config = Config(model_config=OpenAIModelConfig())', 'client = Client(config)']'\"\"\",\n", - " question=\"How to use LLM to generate data in Uniflow\",\n", - " answer=\"We can use the Uniflow's default [OpenAIModelConfig] to generate questions and answers with code: '['config = Config(model_config=OpenAIModelConfig())', 'client = Client(config)']'\",\n", + " context=context,\n", + " question=question,\n", + " answer=answer,\n", " )\n", " ]\n", ")" diff --git a/example/transform/openai_model.ipynb b/example/transform/openai_model.ipynb index 9a023488..4f64a03e 100644 --- a/example/transform/openai_model.ipynb +++ b/example/transform/openai_model.ipynb @@ -120,7 +120,7 @@ "metadata": {}, "outputs": [], "source": [ - "raw_context_input = [\"It was a sunny day 
and the sky color is blue.\", \"My name is bobby and I am a talent software engineer working on AI/ML.\"]" + "raw_context_input = [\"\"\"It was a sunny day and the sky color is blue.\"\"\", \"\"\"My name is bobby and I am a talent software engineer working on AI/ML.\"\"\"]" ] }, { diff --git a/example/transform/openai_pdf_source_10k_QA.ipynb b/example/transform/openai_pdf_source_10k_QA.ipynb index d563dae7..2a99769e 100644 --- a/example/transform/openai_pdf_source_10k_QA.ipynb +++ b/example/transform/openai_pdf_source_10k_QA.ipynb @@ -215,14 +215,21 @@ "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last \\\n", + "example. Follow the format of the examples below to include context, question, and answer in the response\"\"\"\n", + "\n", + "context=\"\"\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) \\\n", + "establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation \\\n", + "entropy for the first time. We will begin our journey here.\"\"\"\n", + "answer=\"\"\"Claude E. Shannon.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"\"\"Generate one question and its corresponding answer based on the last context in the last\n", - " example. Follow the format of the examples below to include context, question, and answer in the response\"\"\",\n", + " instruction=instruction,\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\",\n", - " question=\"Who published A Mathematical Theory of Communication in 1948?\",\n", - " answer=\"Claude E. 
Shannon.\",\n", + " context=context,\n", + " question=\"\"\"Who published A Mathematical Theory of Communication in 1948?\"\"\",\n", + " answer=answer,\n", " ),\n", "])" ] diff --git a/example/transform/openai_pdf_source_10k_summary.ipynb b/example/transform/openai_pdf_source_10k_summary.ipynb index 5d6922fb..2d9632e2 100644 --- a/example/transform/openai_pdf_source_10k_summary.ipynb +++ b/example/transform/openai_pdf_source_10k_summary.ipynb @@ -187,12 +187,24 @@ "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate a one sentence summary based on the last context below. \\\n", + "Follow the format of the examples below to include context and summary in the response\"\"\"\n", + "\n", + "context=\"\"\"When you're operating on the maker's schedule, meetings are a disaster. \\\n", + "A single meeting can blow a whole afternoon, by breaking it into two pieces each too small \\\n", + "to do anything hard in. Plus you have to remember to go to the meeting. That's no problem \\\n", + "for someone on the manager's schedule. There's always something coming on the next hour; \\\n", + "the only question is what. But when someone on the maker's schedule has a meeting, \\\n", + "they have to think about it.\"\"\"\n", + "summary=\"\"\"Meetings disrupt the productivity of those following a maker's schedule, dividing their time \\\n", + "into impractical segments, while those on a manager's schedule are accustomed to a continuous flow of tasks.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"Generate a one sentence summary based on the last context below. Follow the format of the examples below to include context and summary in the response\",\n", + " instruction=instruction,\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"When you're operating on the maker's schedule, meetings are a disaster. A single meeting can blow a whole afternoon, by breaking it into two pieces each too small to do anything hard in. 
Plus you have to remember to go to the meeting. That's no problem for someone on the manager's schedule. There's always something coming on the next hour; the only question is what. But when someone on the maker's schedule has a meeting, they have to think about it.\",\n", - " summary=\"Meetings disrupt the productivity of those following a maker's schedule, dividing their time into impractical segments, while those on a manager's schedule are accustomed to a continuous flow of tasks.\",\n", + " context=context,\n", + " summary=summary,\n", " ),\n", " ],\n", ")" diff --git a/example/transform/self_instruct_custom_html_source.ipynb b/example/transform/self_instruct_custom_html_source.ipynb index 60caca7d..0b0c8f6d 100644 --- a/example/transform/self_instruct_custom_html_source.ipynb +++ b/example/transform/self_instruct_custom_html_source.ipynb @@ -135,17 +135,26 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate one question and its corresponding answer based on context. \\\n", + "Following the format of the examples below to include the same context, question, and answer in the response.\"\"\"\n", + "\n", + "context=\"\"\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) \\\n", + "establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation \\\n", + "entropy for the first time. We will begin our journey here.\"\"\"\n", + "question=\"\"\"Who published A Mathematical Theory of Communication in 1948?\"\"\"\n", + "answer=\"\"\"Claude E. Shannon.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"Generate one question and its corresponding answer based on context. 
Following the format of the examples below to include the same context, question, and answer in the response.\",\n", + " instruction=instruction,\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\",\n", - " question=\"Who published A Mathematical Theory of Communication in 1948?\",\n", - " answer=\"Claude E. Shannon.\",\n", + " context=context,\n", + " question=question,\n", + " answer=answer,\n", " )\n", " ]\n", ")\n", diff --git a/example/transform/self_instruct_pdf_source.ipynb b/example/transform/self_instruct_pdf_source.ipynb index 5d250908..06c9c68d 100644 --- a/example/transform/self_instruct_pdf_source.ipynb +++ b/example/transform/self_instruct_pdf_source.ipynb @@ -130,13 +130,22 @@ "metadata": {}, "outputs": [], "source": [ + "instruction=\"\"\"Generate one question and its corresponding answer based on the context. \\\n", + "Following the format of the examples below to include the same context, question, and answer in the response.\"\"\"\n", + "\n", + "context=\"\"\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) \\\n", + "establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy \\\n", + "for the first time. We will begin our journey here.\"\"\"\n", + "question=\"\"\"Who published A Mathematical Theory of Communication in 1948?\"\"\"\n", + "answer=\"\"\"Claude E. Shannon.\"\"\"\n", + "\n", "guided_prompt = PromptTemplate(\n", - " instruction=\"Generate one question and its corresponding answer based on the context. 
Following the format of the examples below to include the same context, question, and answer in the response.\",\n", + " instruction=instruction,\n", " few_shot_prompt=[\n", " Context(\n", - " context=\"In 1948, Claude E. Shannon published A Mathematical Theory of\\nCommunication (Shannon, 1948) establishing the theory of\\ninformation. In his article, Shannon introduced the concept of\\ninformation entropy for the first time. We will begin our journey here.\",\n", - " question=\"Who published A Mathematical Theory of Communication in 1948?\",\n", - " answer=\"Claude E. Shannon.\"\n", + " context=context,\n", + " question=question,\n", + " answer=answer\n", " ),\n", " ]\n", ")\n", diff --git a/uniflow/flow/config.py b/uniflow/flow/config.py index cbc62f0e..0f1a89b5 100644 --- a/uniflow/flow/config.py +++ b/uniflow/flow/config.py @@ -38,7 +38,7 @@ class ExtractConfig: class ExtractTxtConfig(ExtractConfig): """Extract Txt Config Class.""" - flow_name: str = "ExtractTxtFlow" + flow_name: str = """ExtractTxtFlow""" @dataclass @@ -52,7 +52,7 @@ class ExtractS3TxtConfig(ExtractConfig): class ExtractPDFConfig(ExtractConfig): """Nougat Config Class.""" - flow_name: str = "ExtractPDFFlow" + flow_name: str = """ExtractPDFFlow""" model_config: ModelConfig = field(default_factory=NougatModelConfig) splitter: str = PARAGRAPH_SPLITTER @@ -61,7 +61,7 @@ class ExtractPDFConfig(ExtractConfig): class ExtractImageConfig(ExtractConfig): """Extract Image Config Class""" - flow_name: str = "ExtractImageFlow" + flow_name: str = """ExtractImageFlow""" model_config: ModelConfig = field(default_factory=LayoutModelConfig()) splitter: str = PARAGRAPH_SPLITTER @@ -70,7 +70,7 @@ class ExtractImageConfig(ExtractConfig): class ExtractMarkdownConfig(ExtractConfig): """Extract Markdown Config Class.""" - flow_name: str = "ExtractMarkdownFlow" + flow_name: str = """ExtractMarkdownFlow""" splitter: str = MARKDOWN_HEADER_SPLITTER @@ -78,7 +78,7 @@ class ExtractMarkdownConfig(ExtractConfig): class 
ExtractIpynbConfig(ExtractConfig): """Extract ipynb Config Class.""" - flow_name: str = "ExtractIpynbFlow" + flow_name: str = """ExtractIpynbFlow""" @dataclass @@ -103,20 +103,18 @@ class TransformConfig: num_thread: int = 1 prompt_template: PromptTemplate = field( default_factory=lambda: PromptTemplate( - instruction=""" - Generate one question and its corresponding answer based on the last context in the last - example. Follow the format of the examples below to include context, question, and answer in the response. - """, + instruction="""Generate one question and its corresponding answer based on the last context in the last \ +example. Follow the format of the examples below to include context, question, and answer in the response.""", few_shot_prompt=[ Context( - context="The quick brown fox jumps over the lazy black dog.", - question="What is the color of the fox?", - answer="brown.", + context="""The quick brown fox jumps over the lazy black dog.""", + question="""What is the color of the fox?""", + answer="""brown.""", ), Context( - context="The quick brown fox jumps over the lazy black dog.", - question="What is the color of the dog?", - answer="black.", + context="""The quick brown fox jumps over the lazy black dog.""", + question="""What is the color of the dog?""", + answer="""black.""", ), ], ) @@ -127,7 +125,7 @@ class TransformConfig: class TransformOpenAIConfig(TransformConfig): """Transform OpenAI Config Class.""" - flow_name: str = "TransformOpenAIFlow" + flow_name: str = """TransformOpenAIFlow""" model_config: ModelConfig = field(default_factory=OpenAIModelConfig) @@ -135,7 +133,7 @@ class TransformOpenAIConfig(TransformConfig): class TransformHuggingFaceConfig(TransformConfig): """Transform Hugging Face Config Class.""" - flow_name: str = "TransformHuggingFaceFlow" + flow_name: str = """TransformHuggingFaceFlow""" model_config: ModelConfig = field(default_factory=HuggingfaceModelConfig) @@ -143,7 +141,7 @@ class 
TransformHuggingFaceConfig(TransformConfig): class TransformQAHuggingFaceConfig(TransformConfig): """Transform Hugging Face Config Class for raw response format.""" - flow_name: str = "TransformHuggingFaceFlow" + flow_name: str = """TransformHuggingFaceFlow""" model_config: ModelConfig = field( default_factory=lambda: HuggingfaceModelConfig( response_start_key="question", response_format={"type": "text"} @@ -152,20 +150,18 @@ class TransformQAHuggingFaceConfig(TransformConfig): num_thread: int = 1 prompt_template: PromptTemplate = field( default_factory=lambda: PromptTemplate( - instruction=""" - Generate one question and its corresponding answer based on the last context in the last - example. Follow the format of the examples below to include context, question, and answer in the response. - """, + instruction="""Generate one question and its corresponding answer based on the last context in the last \ +example. Follow the format of the examples below to include context, question, and answer in the response.""", few_shot_prompt=[ Context( - context="The quick brown fox jumps over the lazy black dog.", - question="What is the color of the fox?", - answer="brown.", + context="""The quick brown fox jumps over the lazy black dog.""", + question="""What is the color of the fox?""", + answer="""brown.""", ), Context( - context="The quick brown fox jumps over the lazy black dog.", - question="What is the color of the dog?", - answer="black.", + context="""The quick brown fox jumps over the lazy black dog.""", + question="""What is the color of the dog?""", + answer="""black.""", ), ], ) @@ -176,7 +172,7 @@ class TransformQAHuggingFaceConfig(TransformConfig): class TransformQAHuggingFaceJsonFormatConfig(TransformConfig): """Transform Hugging Face QA Config Class for Json response format.""" - flow_name: str = "TransformHuggingFaceFlow" + flow_name: str = """TransformHuggingFaceFlow""" # model will start generating response starting from # question, so the response start 
key is question. # this is very important for the model to generate valid json response. @@ -188,20 +184,18 @@ class TransformQAHuggingFaceJsonFormatConfig(TransformConfig): num_thread: int = 1 prompt_template: PromptTemplate = field( default_factory=lambda: PromptTemplate( - instruction=""" - Generate one question and its corresponding answer based on the last context in the last - example. Follow the format of the examples below to include context, question, and answer in the response. - """, + instruction="""Generate one question and its corresponding answer based on the last context in the last \ +example. Follow the format of the examples below to include context, question, and answer in the response.""", few_shot_prompt=[ Context( - context="The quick brown fox jumps over the lazy black dog.", - question="What is the color of the fox?", - answer="brown.", + context="""The quick brown fox jumps over the lazy black dog.""", + question="""What is the color of the fox?""", + answer="""brown.""", ), Context( - context="The quick brown fox jumps over the lazy black dog.", - question="What is the color of the dog?", - answer="black.", + context="""The quick brown fox jumps over the lazy black dog.""", + question="""What is the color of the dog?""", + answer="""black.""", ), ], ) @@ -212,9 +206,9 @@ class TransformQAHuggingFaceJsonFormatConfig(TransformConfig): class TransformLMQGConfig(TransformConfig): """Transform LMQG Config Class.""" - flow_name: str = "TransformLMQGFlow" + flow_name: str = """TransformLMQGFlow""" prompt_template: PromptTemplate = field( - default_factory=lambda: PromptTemplate(instruction="", few_shot_prompt=[]) + default_factory=lambda: PromptTemplate(instruction="""""", few_shot_prompt=[]) ) model_config: ModelConfig = field(default_factory=LMQGModelConfig) @@ -223,16 +217,16 @@ class TransformLMQGConfig(TransformConfig): class TransformCopyConfig(TransformConfig): """Transform Linear Config Class.""" - flow_name: str = "TransformCopyFlow" + 
flow_name: str = """TransformCopyFlow""" prompt_template: PromptTemplate = field( - default_factory=lambda: PromptTemplate(instruction="", few_shot_prompt=[]) + default_factory=lambda: PromptTemplate(instruction="""""", few_shot_prompt=[]) ) model_config: ModelConfig = field(default_factory=lambda: {}) @dataclass class TransformForGenerationOpenAIGPT3p5Config(TransformConfig): - flow_name: str = "TransformOpenAIFlow" + flow_name: str = """TransformOpenAIFlow""" model_config: ModelConfig = field( default_factory=lambda: OpenAIModelConfig( model_name="gpt-3.5-turbo-1106", @@ -244,16 +238,14 @@ class TransformForGenerationOpenAIGPT3p5Config(TransformConfig): ) prompt_template: PromptTemplate = field( default_factory=lambda: PromptTemplate( - instruction=""" - Your role is to explore the forefront of technological developments. Examine the text for mentions of state-of-the-art technology applications, innovative methods, or emerging areas of innovation. If present, list each technology by name in a string format. If none are mentioned, return an empty list. Ensure the response is always in a list format. - """, + instruction="""Your role is to explore the forefront of technological developments. Examine the text for mentions of state-of-the-art technology applications, innovative methods, or emerging areas of innovation. If present, list each technology by name in a string format. If none are mentioned, return an empty list. Ensure the response is always in a list format.""", few_shot_prompt=[ Context( - context="Our new business wins are supported by our product leadership strategy of bringing new product to market that provides value for our customers, such as market-leading 500 bar GDi technology, helping customers improve efficiency, reduce emissions and lower costs leveraging our GDi technology and capital to provide a value-focused solution for our off-highway diesel applications and hydrogen ICE that differentiates us from our competition. 
We're helping our customers move towards carbon neutral and carbon-free fuels with solutions using ethanol, biofuels and hydrogen, as it's our view that a liquefied or gaseous fuel is going to be a key element of our journey to carbon neutrality.", - answer=["500 bar GDi technology", "carbon neutral"], + context="""Our new business wins are supported by our product leadership strategy of bringing new product to market that provides value for our customers, such as market-leading 500 bar GDi technology, helping customers improve efficiency, reduce emissions and lower costs leveraging our GDi technology and capital to provide a value-focused solution for our off-highway diesel applications and hydrogen ICE that differentiates us from our competition. We're helping our customers move towards carbon neutral and carbon-free fuels with solutions using ethanol, biofuels and hydrogen, as it's our view that a liquefied or gaseous fuel is going to be a key element of our journey to carbon neutrality.""", + answer=["""500 bar GDi technology""", """carbon neutral"""], ), Context( - context="The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.", + context="""The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. 
It was constructed in 1889 and stands at a height of 324 meters.""", answer=[], ), ], @@ -263,7 +255,7 @@ class TransformForGenerationOpenAIGPT3p5Config(TransformConfig): @dataclass class TransformForClusteringOpenAIGPT4Config: - flow_name: str = "TransformOpenAIFlow" + flow_name: str = """TransformOpenAIFlow""" model_config: ModelConfig = field( default_factory=lambda: OpenAIModelConfig( model_name="gpt-4-1106-preview", @@ -275,53 +267,59 @@ class TransformForClusteringOpenAIGPT4Config: ) prompt_template: PromptTemplate = field( default_factory=lambda: PromptTemplate( - instruction=""" - Your task as a technology expert is to categorize a list of tech terms. First, cluster these terms into distinct groups based on their semantic similarities, where each group encapsulates a specific technological concept. Second, within these clusters, identify and merge terms that are essentially synonymous. Your final output should be a well-structured dictionary, where each key signifies a unique category of technology, and its corresponding value is a list of technology terms. - """, + instruction="""Your task as a technology expert is to categorize a list of tech terms. First, cluster these terms into distinct groups based on their semantic similarities, where each group encapsulates a specific technological concept. Second, within these clusters, identify and merge terms that are essentially synonymous. 
Your final output should be a well-structured dictionary, where each key signifies a unique category of technology, and its corresponding value is a list of technology terms.""", few_shot_prompt=[ Context( context=[ - "artificial intelligence", - "AI", - "500 bar GDi technology", - "ML", - "500 bar GDi", - "machine learning", + """artificial intelligence""", + """AI""", + """500 bar GDi technology""", + """ML""", + """500 bar GDi""", + """machine learning""", ], answer={ - "500_BAR_GDI": ["500 bar GDi"], - "AIML": ["AI", "ML"], + """500_BAR_GDI""": ["""500 bar GDi"""], + """AIML""": ["""AI""", """ML"""], }, ), Context( context=[ - "cryptocurrency", - "blockchain", - "Bitcoin", - "Ethereum", - "digital currency", - "crypto mining", - "mRNA vaccine", - "gene editing", - "CRISPR", - "Ethereum platform", - "Ether", - "NFTs", - "DNA sequencing", - "bioinformatics", - "mRNA therapy", + """cryptocurrency""", + """blockchain""", + """Bitcoin""", + """Ethereum""", + """digital currency""", + """crypto mining""", + """mRNA vaccine""", + """gene editing""", + """CRISPR""", + """Ethereum platform""", + """Ether""", + """NFTs""", + """DNA sequencing""", + """bioinformatics""", + """mRNA therapy""", ], answer={ - "BIO_TECH": [ - "mRNA vaccine", - "gene editing", - "CRISPR", - "DNA sequencing", - "bioinformatics", - "mRNA therapy", + """BIO_TECH""": [ + """mRNA vaccine""", + """gene editing""", + """CRISPR""", + """DNA sequencing""", + """bioinformatics""", + """mRNA therapy""", + ], + """BLOCKCHAIN_TECH""": [ + """blockchain""", + """crypto mining""", + """NFTs""", + ], + """CRYPTOCURRENCY""": [ + """Bitcoin""", + """cryptocurrency""", + """Ethereum""", ], - "BLOCKCHAIN_TECH": ["blockchain", "crypto mining", "NFTs"], - "CRYPTOCURRENCY": ["Bitcoin", "cryptocurrency", "Ethereum"], }, ), ], @@ -336,7 +334,7 @@ class TransformForClusteringOpenAIGPT4Config: class RaterConfig: """Rater Config Class.""" - flow_name: str = "RaterFlow" + flow_name: str = """RaterFlow""" model_config: 
ModelConfig = field(default_factory=ModelConfig) label2score: Dict[str, float] = field(default_factory=dict) prompt_template: PromptTemplate = field(default_factory=PromptTemplate) @@ -356,11 +354,11 @@ def __post_init__(self): missing_labels = incompatible_labels["missing_labels"] if unexpected_labels: raise ValueError( - "Inconsistent labels found in prompt_template examples, " - f"example label {unexpected_labels} not in label2score has keys {list(self.label2score.keys())}", + """Inconsistent labels found in prompt_template examples, """ + f"""example label {unexpected_labels} not in label2score has keys {list(self.label2score.keys())}""", ) if missing_labels: - print(f"The label2score label {missing_labels} not in example label.") + print(f"""The label2score label {missing_labels} not in example label.""") # batch_size must be divisible by num_return_sequences for HuggingfaceModelConfig only # This might need to be extended to other model configs in the future. if isinstance(self.model_config, HuggingfaceModelConfig): @@ -369,8 +367,8 @@ def __post_init__(self): != 0 # noqa E501 ): raise ValueError( - f"batch_size {self.model_config.batch_size} must be divisible by" - f"num_return_sequences {self.model_config.num_return_sequences}" + f"""batch_size {self.model_config.batch_size} must be divisible by""" + f"""num_return_sequences {self.model_config.num_return_sequences}""" ) def check_labels(self) -> Dict[str, list]: @@ -428,26 +426,24 @@ class RaterForClassificationOpenAIGPT4Config(RaterConfig): ) prompt_template: PromptTemplate = field( default_factory=lambda: PromptTemplate( - instruction=""" - Evaluate the appropriateness of a given answer based on the question and the context. - There are few examples below, consisting of context, question, answer, explanation and label. - If answer is appropriate, you should give a label representing higher score and vise versa. Check label to score dictionary: {label2score}. 
- Your response should only focus on the unlabeled sample, including two fields: explanation and label (one of {label_list}). - """, + instruction="""Evaluate the appropriateness of a given answer based on the question and the context. \ +There are few examples below, consisting of context, question, answer, explanation and label. \ +If answer is appropriate, you should give a label representing higher score and vise versa. Check label to score dictionary: {label2score}. \ +Your response should only focus on the unlabeled sample, including two fields: explanation and label (one of {label_list}).""", few_shot_prompt=[ Context( - context="The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.", - question="When was the Eiffel Tower constructed?", - answer="The Eiffel Tower was constructed in 1889.", - explanation="The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.", - label="Yes", + context="""The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.""", + question="""When was the Eiffel Tower constructed?""", + answer="""The Eiffel Tower was constructed in 1889.""", + explanation="""The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.""", + label="""Yes""", ), Context( - context="Photosynthesis is a process used by plants to convert light energy into chemical energy. 
This process primarily occurs in the chloroplasts of plant cells.", - question="Where does photosynthesis primarily occur in plant cells?", - answer="Photosynthesis primarily occurs in the mitochondria of plant cells.", - explanation="The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.", - label="No", + context="""Photosynthesis is a process used by plants to convert light energy into chemical energy. This process primarily occurs in the chloroplasts of plant cells.""", + question="""Where does photosynthesis primarily occur in plant cells?""", + answer="""Photosynthesis primarily occurs in the mitochondria of plant cells.""", + explanation="""The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.""", + label="""No""", ), ], ) @@ -498,18 +494,18 @@ class RaterForClassificationOpenAIGPT3p5Config(RaterConfig): """, few_shot_prompt=[ Context( - context="The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.", - question="When was the Eiffel Tower constructed?", - answer="The Eiffel Tower was constructed in 1889.", - explanation="The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.", - label="Yes", + context="""The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.""", + question="""When was the Eiffel Tower constructed?""", + answer="""The Eiffel Tower was constructed in 1889.""", + explanation="""The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.""", + label="""Yes""", ), Context( - context="Photosynthesis is a process used by plants to convert light energy into chemical energy. 
This process primarily occurs in the chloroplasts of plant cells.", - question="Where does photosynthesis primarily occur in plant cells?", - answer="Photosynthesis primarily occurs in the mitochondria of plant cells.", - explanation="The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.", - label="No", + context="""Photosynthesis is a process used by plants to convert light energy into chemical energy. This process primarily occurs in the chloroplasts of plant cells.""", + question="""Where does photosynthesis primarily occur in plant cells?""", + answer="""Photosynthesis primarily occurs in the mitochondria of plant cells.""", + explanation="""The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.""", + label="""No""", ), ], ) @@ -547,18 +543,18 @@ class RaterForClassificationBedrockClaudeConfig(RaterConfig): """, few_shot_prompt=[ Context( - context="The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.", - question="When was the Eiffel Tower constructed?", - answer="The Eiffel Tower was constructed in 1889.", - explanation="The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.", - label="Yes", + context="""The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.""", + question="""When was the Eiffel Tower constructed?""", + answer="""The Eiffel Tower was constructed in 1889.""", + explanation="""The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.""", + label="""Yes""", ), Context( - context="Photosynthesis is a process used by plants to convert light energy into chemical energy. 
This process primarily occurs in the chloroplasts of plant cells.", - question="Where does photosynthesis primarily occur in plant cells?", - answer="Photosynthesis primarily occurs in the mitochondria of plant cells.", - explanation="The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.", - label="No", + context="""Photosynthesis is a process used by plants to convert light energy into chemical energy. This process primarily occurs in the chloroplasts of plant cells.""", + question="""Where does photosynthesis primarily occur in plant cells?""", + answer="""Photosynthesis primarily occurs in the mitochondria of plant cells.""", + explanation="""The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.""", + label="""No""", ), ], ) @@ -581,7 +577,7 @@ class RaterForClassificationSageMakerEndpointConfig(RaterConfig): answer, label, and explanation for each case. """ - flow_name: str = "RaterFlow" + flow_name: str = """RaterFlow""" model_config: ModelConfig = field(default_factory=SageMakerModelConfig) label2score: Dict[str, float] = field( default_factory=lambda: {"Yes": 1.0, "No": 0.0} @@ -595,20 +591,20 @@ class RaterForClassificationSageMakerEndpointConfig(RaterConfig): Context( context="""The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.""", - question="When was the Eiffel Tower constructed?", - answer="The Eiffel Tower was constructed in 1889.", + question="""When was the Eiffel Tower constructed?""", + answer="""The Eiffel Tower was constructed in 1889.""", explanation="""The context explicitly mentions that the Eiffel Tower was constructed in 1889, so the answer is correct.""", - label="Yes", + label="""Yes""", ), Context( context="""Photosynthesis is a process used by plants to convert light energy into chemical energy. 
This process primarily occurs in the chloroplasts of plant cells.""", - question="Where does photosynthesis primarily occur in plant cells?", - answer="Photosynthesis primarily occurs in the mitochondria of plant cells.", + question="""Where does photosynthesis primarily occur in plant cells?""", + answer="""Photosynthesis primarily occurs in the mitochondria of plant cells.""", explanation="""The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells, so the answer is incorrect.""", - label="No", + label="""No""", ), ], ) @@ -634,7 +630,7 @@ class RaterForClassificationHuggingfaceConfig(RaterConfig): model_config: ModelConfig = field( default_factory=HuggingfaceModelConfig( - response_start_key="explanation", + response_start_key="""explanation""", response_format={"type": "text"}, batch_size=1, ) @@ -648,18 +644,18 @@ class RaterForClassificationHuggingfaceConfig(RaterConfig): Follow the format of the examples below, consisting of context, question, answer, explanation and label (you must choose one from {label_list}).""", few_shot_prompt=[ Context( - context="The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. It was constructed in 1889 and stands at a height of 324 meters.", - question="When was the Eiffel Tower constructed?", - answer="The Eiffel Tower was constructed in 1889.", - explanation="The answer is consistency to the fact that Eiffel Tower was constructed in 1889 mentioned in context, so the answer is correct.", - label="Yes", + context="""The Eiffel Tower, located in Paris, France, is one of the most famous landmarks in the world. 
It was constructed in 1889 and stands at a height of 324 meters.""", + question="""When was the Eiffel Tower constructed?""", + answer="""The Eiffel Tower was constructed in 1889.""", + explanation="""The answer is consistency to the fact that Eiffel Tower was constructed in 1889 mentioned in context, so the answer is correct.""", + label="""Yes""", ), Context( - context="Photosynthesis is a process used by plants to convert light energy into chemical energy. This process primarily occurs in the chloroplasts of plant cells.", - question="Where does photosynthesis primarily occur in plant cells?", - answer="Photosynthesis primarily occurs in the mitochondria of plant cells.", - explanation="The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells but not mitochondria indicated by answer, so the answer is incorrect.", - label="No", + context="""Photosynthesis is a process used by plants to convert light energy into chemical energy. This process primarily occurs in the chloroplasts of plant cells.""", + question="""Where does photosynthesis primarily occur in plant cells?""", + answer="""Photosynthesis primarily occurs in the mitochondria of plant cells.""", + explanation="""The context mentions that photosynthesis primarily occurs in the chloroplasts of plant cells but not mitochondria indicated by answer, so the answer is incorrect.""", + label="""No""", ), ], ) @@ -710,11 +706,11 @@ class RaterForGeneratedAnswerOpenAIGPT4Config(RaterConfig): """, few_shot_prompt=[ Context( - context="Early computers were built to perform a series of single tasks, like a calculator.", - question="Did early computers function like modern calculators?", - grounding_answer="No. Early computers were used primarily for complex calculating.", - generated_answer="Yes. 
Early computers were built to perform a series of single tasks, similar to a calculator.", - explanation="The generated answer is better because it correctly figures out early computers was used to perform single tasks akin to calculators while grounding answer not. So we accept generated answer.", + context="""Early computers were built to perform a series of single tasks, like a calculator.""", + question="""Did early computers function like modern calculators?""", + grounding_answer="""No. Early computers were used primarily for complex calculating.""", + generated_answer="""Yes. Early computers were built to perform a series of single tasks, similar to a calculator.""", + explanation="""The generated answer is better because it correctly figures out early computers was used to perform single tasks akin to calculators while grounding answer not. So we accept generated answer.""", label="accept", ), ], @@ -774,27 +770,27 @@ class RaterForGeneratedAnswerOpenAIGPT3p5Config(RaterConfig): """, few_shot_prompt=[ Context( - context="Early computers were built to perform a series of single tasks, like a calculator.", - question="Did early computers function like modern calculators?", - grounding_answer="No. Early computers were used primarily for complex calculating.", - generated_answer="Yes. Early computers were built to perform a series of single tasks, similar to a calculator.", - explanation="The generated answer is better because it correctly figures out early computers was used to perform single tasks akin to calculators.", + context="""Early computers were built to perform a series of single tasks, like a calculator.""", + question="""Did early computers function like modern calculators?""", + grounding_answer="""No. Early computers were used primarily for complex calculating.""", + generated_answer="""Yes. 
Early computers were built to perform a series of single tasks, similar to a calculator.""", + explanation="""The generated answer is better because it correctly figures out early computers was used to perform single tasks akin to calculators.""", label="accept", ), Context( - context="Operating systems(OS) did not exist in their modern and more complex forms until the early 1960s.", - question="When did operating systems start to resemble their modern forms?", - grounding_answer="Operating systems started to resemble their modern forms in the early 1960s.", - generated_answer="Modern and more complex forms of operating systems began to emerge in the early 1960s.", - explanation="The generated answer is as equally good as grounding answer because they both accurately pinpoint the early 1960s as the period when modern operating systems began to develop.", + context="""Operating systems(OS) did not exist in their modern and more complex forms until the early 1960s.""", + question="""When did operating systems start to resemble their modern forms?""", + grounding_answer="""Operating systems started to resemble their modern forms in the early 1960s.""", + generated_answer="""Modern and more complex forms of operating systems began to emerge in the early 1960s.""", + explanation="""The generated answer is as equally good as grounding answer because they both accurately pinpoint the early 1960s as the period when modern operating systems began to develop.""", label="equivalent", ), Context( - context="Hardware features were added, that enabled use of runtime libraries, interrupts, and parallel processing in the 1960s.", - question="What features were added to hardware in the 1960s?", - grounding_answer="Hardware in the 1960s saw the addition of features like runtime libraries and parallel processing.", - generated_answer="The 1960s saw the addition of input output control and compatible timesharing capabilities in hardware.", - explanation="The generated answer is worse 
because it inaccurately suggests the addition of capabilities of hardware in 1960s which is not supported by the context.", + context="""Hardware features were added, that enabled use of runtime libraries, interrupts, and parallel processing in the 1960s.""", + question="""What features were added to hardware in the 1960s?""", + grounding_answer="""Hardware in the 1960s saw the addition of features like runtime libraries and parallel processing.""", + generated_answer="""The 1960s saw the addition of input output control and compatible timesharing capabilities in hardware.""", + explanation="""The generated answer is worse because it inaccurately suggests the addition of capabilities of hardware in 1960s which is not supported by the context.""", label="reject", ), ], diff --git a/uniflow/node.py b/uniflow/node.py index d48c6d6a..9b9fab74 100644 --- a/uniflow/node.py +++ b/uniflow/node.py @@ -1,4 +1,4 @@ -""" Node module for uniflow.""" +"""Node module for uniflow.""" from typing import Any, Mapping, Optional, Sequence diff --git a/uniflow/op/extract/load/aws/s3_op.py b/uniflow/op/extract/load/aws/s3_op.py index b7147d34..ca51a6ad 100644 --- a/uniflow/op/extract/load/aws/s3_op.py +++ b/uniflow/op/extract/load/aws/s3_op.py @@ -18,7 +18,7 @@ class ExtractS3Op(Op): def __init__(self, name: str = "extract_s3_op") -> None: try: - import boto3 # pylint: disable=import-outside-toplevel + import boto3 except ImportError as e: raise ImportError("Please install boto3 to use S3Op.") from e diff --git a/uniflow/op/extract/load/ipynb_op.py b/uniflow/op/extract/load/ipynb_op.py index fd1b7113..17670800 100644 --- a/uniflow/op/extract/load/ipynb_op.py +++ b/uniflow/op/extract/load/ipynb_op.py @@ -12,10 +12,8 @@ class ExtractIpynbOp(Op): def __init__(self, name: str) -> None: try: - import nbformat # pylint: disable=import-outside-toplevel - from nbconvert import ( # pylint: disable=import-outside-toplevel - MarkdownExporter, - ) + import nbformat + from nbconvert import 
MarkdownExporter except ModuleNotFoundError as exc: raise ModuleNotFoundError( "Please install nbformat and nbconvert to load ipynb file. You can use `pip install nbformat nbconvert` to install them." diff --git a/uniflow/op/model/model_server.py b/uniflow/op/model/model_server.py index b9eb1723..02e71872 100644 --- a/uniflow/op/model/model_server.py +++ b/uniflow/op/model/model_server.py @@ -147,7 +147,7 @@ def __init__( self, prompt_template: PromptTemplate, model_config: Dict[str, Any] ) -> None: # import in class level to avoid installing openai package - from openai import OpenAI # pylint: disable=import-outside-toplevel + from openai import OpenAI super().__init__(prompt_template, model_config) self._model_config = OpenAIModelConfig(**self._model_config) @@ -221,7 +221,7 @@ def __init__( self, prompt_template: PromptTemplate, model_config: Dict[str, Any] ) -> None: # import in class level to avoid installing openai package - from openai import AzureOpenAI # pylint: disable=import-outside-toplevel + from openai import AzureOpenAI super().__init__(prompt_template, model_config) self._model_config = AzureOpenAIModelConfig(**self._model_config) @@ -295,9 +295,7 @@ def __init__( self._model_config = HuggingfaceModelConfig(**self._model_config) if self._model_config.neuron is False: try: - from transformers import ( # pylint: disable=import-outside-toplevel - pipeline, - ) + from transformers import pipeline except ModuleNotFoundError as exc: raise ModuleNotFoundError( "Please install transformers to use HuggingfaceModelServer. You can use `pip install transformers` to install it." @@ -327,9 +325,7 @@ def __init__( print( "Neuron model does not support quantized models. load_in_4bit and load_in_8bit are automatically set to False." 
) - from uniflow.op.model.neuron_utils import ( # pylint: disable=import-outside-toplevel - Neuron, - ) + from uniflow.op.model.neuron_utils import Neuron model, tokenizer = Neuron.get_neuron_model( self._model_config.model_name, self._model_config.batch_size @@ -341,10 +337,7 @@ def __init__( def _get_model(self): """Get model.""" - from transformers import ( # pylint: disable=import-outside-toplevel - AutoModelForCausalLM, - AutoTokenizer, - ) + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained( self._model_config.model_name, @@ -468,7 +461,7 @@ def __init__( self, prompt_template: PromptTemplate, model_config: Dict[str, Any] ) -> None: # import in class level to avoid installing transformers package - from lmqg import TransformersQG # pylint: disable=import-outside-toplevel + from lmqg import TransformersQG super().__init__(prompt_template, model_config) self._model_config = LMQGModelConfig(**self._model_config) @@ -522,13 +515,9 @@ def __init__( ) -> None: # import in class level to avoid installing nougat package try: - from nougat import NougatModel # pylint: disable=import-outside-toplevel - from nougat.utils.checkpoint import ( # pylint: disable=import-outside-toplevel - get_checkpoint, - ) - from nougat.utils.device import ( # pylint: disable=import-outside-toplevel - move_to_device, - ) + from nougat import NougatModel + from nougat.utils.checkpoint import get_checkpoint + from nougat.utils.device import move_to_device except ModuleNotFoundError as exc: raise ModuleNotFoundError( "Please install nougat to use NougatModelServer. You can use `pip install nougat-ocr` to install it." @@ -574,16 +563,9 @@ def __call__(self, data: List[str]) -> List[str]: Returns: List[str]: Output data. 
""" - from nougat.postprocessing import ( # pylint: disable=import-outside-toplevel - markdown_compatible, - ) - from nougat.utils.dataset import ( # pylint: disable=import-outside-toplevel - LazyDataset, - ) - from torch.utils.data import ( # pylint: disable=import-outside-toplevel - ConcatDataset, - DataLoader, - ) + from nougat.postprocessing import markdown_compatible + from nougat.utils.dataset import LazyDataset + from torch.utils.data import ConcatDataset, DataLoader outs = [] for pdf in data: @@ -1085,16 +1067,14 @@ def __init__( super().__init__(prompt_template, model_config) self._model_config = LayoutModelConfig(**self._model_config) try: - import easyocr # pylint: disable=import-outside-toplevel + import easyocr self.reader = easyocr.Reader(self._model_config.ocr_lang) except ModuleNotFoundError as exc: raise ModuleNotFoundError( "Please install easyocr to use LayoutModelServer. You can use `pip install easyocr` to install it." ) from exc - from .layout_utils import ( # pylint: disable=import-outside-toplevel - LayoutPredictor, - ) + from .layout_utils import LayoutPredictor self.layout_predictor = LayoutPredictor( self._model_config.model_name, self._model_config.model_file @@ -1131,12 +1111,10 @@ def __call__(self, data: List[str]) -> List[str]: Returns: List[str]: Output data. """ - import cv2 # pylint: disable=import-outside-toplevel - import numpy as np # pylint: disable=import-outside-toplevel + import cv2 + import numpy as np - from uniflow.op.model.layout_utils import ( # pylint: disable=import-outside-toplevel - XYCut, - ) + from uniflow.op.model.layout_utils import XYCut outs = [] for img in data: