mirror of
https://github.com/OpenBMB/ChatDev.git
synced 2024-11-07 18:40:13 +03:00
Merge pull request #337 from NA-Wen/main
Integrate Experiential Co-Learning Module
This commit is contained in:
commit
dc6dd4284d
@ -1,6 +1,5 @@
|
|||||||
{
|
{
|
||||||
"chain": [
|
"chain": [{
|
||||||
{
|
|
||||||
"phase": "DemandAnalysis",
|
"phase": "DemandAnalysis",
|
||||||
"phaseType": "SimplePhase",
|
"phaseType": "SimplePhase",
|
||||||
"max_turn_step": -1,
|
"max_turn_step": -1,
|
||||||
@ -22,21 +21,18 @@
|
|||||||
"phase": "CodeCompleteAll",
|
"phase": "CodeCompleteAll",
|
||||||
"phaseType": "ComposedPhase",
|
"phaseType": "ComposedPhase",
|
||||||
"cycleNum": 10,
|
"cycleNum": 10,
|
||||||
"Composition": [
|
"Composition": [{
|
||||||
{
|
|
||||||
"phase": "CodeComplete",
|
"phase": "CodeComplete",
|
||||||
"phaseType": "SimplePhase",
|
"phaseType": "SimplePhase",
|
||||||
"max_turn_step": 1,
|
"max_turn_step": 1,
|
||||||
"need_reflect": "False"
|
"need_reflect": "False"
|
||||||
}
|
}]
|
||||||
]
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"phase": "CodeReview",
|
"phase": "CodeReview",
|
||||||
"phaseType": "ComposedPhase",
|
"phaseType": "ComposedPhase",
|
||||||
"cycleNum": 3,
|
"cycleNum": 3,
|
||||||
"Composition": [
|
"Composition": [{
|
||||||
{
|
|
||||||
"phase": "CodeReviewComment",
|
"phase": "CodeReviewComment",
|
||||||
"phaseType": "SimplePhase",
|
"phaseType": "SimplePhase",
|
||||||
"max_turn_step": 1,
|
"max_turn_step": 1,
|
||||||
@ -54,8 +50,7 @@
|
|||||||
"phase": "Test",
|
"phase": "Test",
|
||||||
"phaseType": "ComposedPhase",
|
"phaseType": "ComposedPhase",
|
||||||
"cycleNum": 3,
|
"cycleNum": 3,
|
||||||
"Composition": [
|
"Composition": [{
|
||||||
{
|
|
||||||
"phase": "TestErrorSummary",
|
"phase": "TestErrorSummary",
|
||||||
"phaseType": "SimplePhase",
|
"phaseType": "SimplePhase",
|
||||||
"max_turn_step": 1,
|
"max_turn_step": 1,
|
||||||
@ -99,5 +94,6 @@
|
|||||||
"web_spider": "False",
|
"web_spider": "False",
|
||||||
"self_improve": "False",
|
"self_improve": "False",
|
||||||
"incremental_develop": "False",
|
"incremental_develop": "False",
|
||||||
|
"with_memory": "False",
|
||||||
"background_prompt": "ChatDev is a software company powered by multiple intelligent agents, such as chief executive officer, chief human resources officer, chief product officer, chief technology officer, etc, with a multi-agent organizational structure and the mission of 'changing the digital world through programming'."
|
"background_prompt": "ChatDev is a software company powered by multiple intelligent agents, such as chief executive officer, chief human resources officer, chief product officer, chief technology officer, etc, with a multi-agent organizational structure and the mission of 'changing the digital world through programming'."
|
||||||
}
|
}
|
@ -27,7 +27,8 @@
|
|||||||
</p>
|
</p>
|
||||||
|
|
||||||
## 🎉 News
|
## 🎉 News
|
||||||
* **December 28, 2023: We present Experiential Co-Learning, an innovative approach where instructor and assistant agents accumulate shortcut-oriented experiences to effectively solve new tasks, reducing repetitive errors and enhancing efficiency. Check out our preprint paper at https://arxiv.org/abs/2312.17025 and this technique will soon be integrated into ChatDev.**
|
* **January 25, 2024: We integrate Experiential Co-Learning Module into ChatDev. Please see the [Experiential Co-Learning Guide](wiki.md#co-tracking).**
|
||||||
|
* December 28, 2023: We present Experiential Co-Learning, an innovative approach where instructor and assistant agents accumulate shortcut-oriented experiences to effectively solve new tasks, reducing repetitive errors and enhancing efficiency. Check out our preprint paper at https://arxiv.org/abs/2312.17025 and this technique will soon be integrated into ChatDev.
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<img src='./misc/ecl.png' width=860>
|
<img src='./misc/ecl.png' width=860>
|
||||||
</p>
|
</p>
|
||||||
|
Binary file not shown.
@ -1 +0,0 @@
|
|||||||
Subproject commit e0396448114be2e320564cdfbe6bcf4082dd4e42
|
|
Binary file not shown.
@ -28,7 +28,7 @@ from camel.utils import (
|
|||||||
num_tokens_from_messages,
|
num_tokens_from_messages,
|
||||||
openai_api_key_required,
|
openai_api_key_required,
|
||||||
)
|
)
|
||||||
|
from chatdev.utils import log_visualize
|
||||||
try:
|
try:
|
||||||
from openai.types.chat import ChatCompletion
|
from openai.types.chat import ChatCompletion
|
||||||
|
|
||||||
@ -74,6 +74,7 @@ class ChatAgent(BaseAgent):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
system_message (SystemMessage): The system message for the chat agent.
|
system_message (SystemMessage): The system message for the chat agent.
|
||||||
|
with_memory(bool): The memory setting of the chat agent.
|
||||||
model (ModelType, optional): The LLM model to use for generating
|
model (ModelType, optional): The LLM model to use for generating
|
||||||
responses. (default :obj:`ModelType.GPT_3_5_TURBO`)
|
responses. (default :obj:`ModelType.GPT_3_5_TURBO`)
|
||||||
model_config (Any, optional): Configuration options for the LLM model.
|
model_config (Any, optional): Configuration options for the LLM model.
|
||||||
@ -86,6 +87,7 @@ class ChatAgent(BaseAgent):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
system_message: SystemMessage,
|
system_message: SystemMessage,
|
||||||
|
memory = None,
|
||||||
model: Optional[ModelType] = None,
|
model: Optional[ModelType] = None,
|
||||||
model_config: Optional[Any] = None,
|
model_config: Optional[Any] = None,
|
||||||
message_window_size: Optional[int] = None,
|
message_window_size: Optional[int] = None,
|
||||||
@ -102,6 +104,10 @@ class ChatAgent(BaseAgent):
|
|||||||
self.terminated: bool = False
|
self.terminated: bool = False
|
||||||
self.info: bool = False
|
self.info: bool = False
|
||||||
self.init_messages()
|
self.init_messages()
|
||||||
|
if memory !=None and self.role_name in["Code Reviewer","Programmer","Software Test Engineer"]:
|
||||||
|
self.memory = memory.memory_data.get("All")
|
||||||
|
else:
|
||||||
|
self.memory = None
|
||||||
|
|
||||||
def reset(self) -> List[MessageType]:
|
def reset(self) -> List[MessageType]:
|
||||||
r"""Resets the :obj:`ChatAgent` to its initial state and returns the
|
r"""Resets the :obj:`ChatAgent` to its initial state and returns the
|
||||||
@ -159,6 +165,41 @@ class ChatAgent(BaseAgent):
|
|||||||
"""
|
"""
|
||||||
self.stored_messages.append(message)
|
self.stored_messages.append(message)
|
||||||
return self.stored_messages
|
return self.stored_messages
|
||||||
|
def use_memory(self,input_message) -> List[MessageType]:
|
||||||
|
if self.memory is None :
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
if self.role_name == "Programmer":
|
||||||
|
result = self.memory.memory_retrieval(input_message,"code")
|
||||||
|
if result != None:
|
||||||
|
target_memory,distances, mids,task_list,task_dir_list = result
|
||||||
|
if target_memory != None and len(target_memory) != 0:
|
||||||
|
target_memory="".join(target_memory)
|
||||||
|
#self.stored_messages[-1].content = self.stored_messages[-1].content+"Here is some code you've previously completed:"+target_memory+"You can refer to the previous script to complement this task."
|
||||||
|
log_visualize(self.role_name,
|
||||||
|
"thinking back and found some related code: \n--------------------------\n"
|
||||||
|
+ target_memory)
|
||||||
|
else:
|
||||||
|
target_memory = None
|
||||||
|
log_visualize(self.role_name,
|
||||||
|
"thinking back but find nothing useful")
|
||||||
|
|
||||||
|
else:
|
||||||
|
result = self.memory.memory_retrieval(input_message, "text")
|
||||||
|
if result != None:
|
||||||
|
target_memory, distances, mids, task_list, task_dir_list = result
|
||||||
|
if target_memory != None and len(target_memory) != 0:
|
||||||
|
target_memory=";".join(target_memory)
|
||||||
|
#self.stored_messages[-1].content = self.stored_messages[-1].content+"Here are some effective and efficient instructions you have sent to the assistant :"+target_memory+"You can refer to these previous excellent instructions to better instruct assistant here."
|
||||||
|
log_visualize(self.role_name,
|
||||||
|
"thinking back and found some related text: \n--------------------------\n"
|
||||||
|
+ target_memory)
|
||||||
|
else:
|
||||||
|
target_memory = None
|
||||||
|
log_visualize(self.role_name,
|
||||||
|
"thinking back but find nothing useful")
|
||||||
|
|
||||||
|
return target_memory
|
||||||
|
|
||||||
@retry(wait=wait_exponential(min=5, max=60), stop=stop_after_attempt(5))
|
@retry(wait=wait_exponential(min=5, max=60), stop=stop_after_attempt(5))
|
||||||
@openai_api_key_required
|
@openai_api_key_required
|
||||||
|
@ -90,13 +90,16 @@ class RolePlaying:
|
|||||||
sys_msg_generator_kwargs: Optional[Dict] = None,
|
sys_msg_generator_kwargs: Optional[Dict] = None,
|
||||||
extend_sys_msg_meta_dicts: Optional[List[Dict]] = None,
|
extend_sys_msg_meta_dicts: Optional[List[Dict]] = None,
|
||||||
extend_task_specify_meta_dict: Optional[Dict] = None,
|
extend_task_specify_meta_dict: Optional[Dict] = None,
|
||||||
background_prompt: Optional[str] = ""
|
background_prompt: Optional[str] = "",
|
||||||
|
memory = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.with_task_specify = with_task_specify
|
self.with_task_specify = with_task_specify
|
||||||
self.with_task_planner = with_task_planner
|
self.with_task_planner = with_task_planner
|
||||||
self.with_critic_in_the_loop = with_critic_in_the_loop
|
self.with_critic_in_the_loop = with_critic_in_the_loop
|
||||||
self.model_type = model_type
|
self.model_type = model_type
|
||||||
self.task_type = task_type
|
self.task_type = task_type
|
||||||
|
self.memory = memory
|
||||||
|
|
||||||
|
|
||||||
if with_task_specify:
|
if with_task_specify:
|
||||||
task_specify_meta_dict = dict()
|
task_specify_meta_dict = dict()
|
||||||
@ -148,9 +151,9 @@ class RolePlaying:
|
|||||||
meta_dict=sys_msg_meta_dicts[1],
|
meta_dict=sys_msg_meta_dicts[1],
|
||||||
content=user_role_prompt.format(**sys_msg_meta_dicts[1]))
|
content=user_role_prompt.format(**sys_msg_meta_dicts[1]))
|
||||||
|
|
||||||
self.assistant_agent: ChatAgent = ChatAgent(self.assistant_sys_msg, model_type,
|
self.assistant_agent: ChatAgent = ChatAgent(self.assistant_sys_msg, memory, model_type,
|
||||||
**(assistant_agent_kwargs or {}), )
|
**(assistant_agent_kwargs or {}), )
|
||||||
self.user_agent: ChatAgent = ChatAgent(self.user_sys_msg, model_type, **(user_agent_kwargs or {}), )
|
self.user_agent: ChatAgent = ChatAgent(self.user_sys_msg,memory, model_type, **(user_agent_kwargs or {}), )
|
||||||
|
|
||||||
if with_critic_in_the_loop:
|
if with_critic_in_the_loop:
|
||||||
raise ValueError("with_critic_in_the_loop not available")
|
raise ValueError("with_critic_in_the_loop not available")
|
||||||
@ -187,6 +190,9 @@ class RolePlaying:
|
|||||||
content = phase_prompt.format(
|
content = phase_prompt.format(
|
||||||
**({"assistant_role": self.assistant_agent.role_name} | placeholders)
|
**({"assistant_role": self.assistant_agent.role_name} | placeholders)
|
||||||
)
|
)
|
||||||
|
retrieval_memory = self.assistant_agent.use_memory(content)
|
||||||
|
if retrieval_memory!= None:
|
||||||
|
placeholders["examples"] = retrieval_memory
|
||||||
user_msg = UserChatMessage(
|
user_msg = UserChatMessage(
|
||||||
role_name=self.user_sys_msg.role_name,
|
role_name=self.user_sys_msg.role_name,
|
||||||
role="user",
|
role="user",
|
||||||
|
@ -70,7 +70,9 @@ class ChatChain:
|
|||||||
gui_design=check_bool(self.config["gui_design"]),
|
gui_design=check_bool(self.config["gui_design"]),
|
||||||
git_management=check_bool(self.config["git_management"]),
|
git_management=check_bool(self.config["git_management"]),
|
||||||
incremental_develop=check_bool(self.config["incremental_develop"]),
|
incremental_develop=check_bool(self.config["incremental_develop"]),
|
||||||
background_prompt=self.config["background_prompt"])
|
background_prompt=self.config["background_prompt"],
|
||||||
|
with_memory=check_bool(self.config["with_memory"]))
|
||||||
|
|
||||||
self.chat_env = ChatEnv(self.chat_env_config)
|
self.chat_env = ChatEnv(self.chat_env_config)
|
||||||
|
|
||||||
# the user input prompt will be self-improved (if set "self_improve": "True" in ChatChainConfig.json)
|
# the user input prompt will be self-improved (if set "self_improve": "True" in ChatChainConfig.json)
|
||||||
@ -204,6 +206,9 @@ class ChatChain:
|
|||||||
software_path = os.path.join(directory, "_".join([self.project_name, self.org_name, self.start_time]))
|
software_path = os.path.join(directory, "_".join([self.project_name, self.org_name, self.start_time]))
|
||||||
self.chat_env.set_directory(software_path)
|
self.chat_env.set_directory(software_path)
|
||||||
|
|
||||||
|
if self.chat_env.config.with_memory is True:
|
||||||
|
self.chat_env.init_memory()
|
||||||
|
|
||||||
# copy config files to software path
|
# copy config files to software path
|
||||||
shutil.copy(self.config_path, software_path)
|
shutil.copy(self.config_path, software_path)
|
||||||
shutil.copy(self.config_phase_path, software_path)
|
shutil.copy(self.config_phase_path, software_path)
|
||||||
|
@ -13,6 +13,7 @@ from chatdev.codes import Codes
|
|||||||
from chatdev.documents import Documents
|
from chatdev.documents import Documents
|
||||||
from chatdev.roster import Roster
|
from chatdev.roster import Roster
|
||||||
from chatdev.utils import log_visualize
|
from chatdev.utils import log_visualize
|
||||||
|
from ecl.memory import Memory
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall
|
from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall
|
||||||
@ -28,15 +29,18 @@ class ChatEnvConfig:
|
|||||||
gui_design,
|
gui_design,
|
||||||
git_management,
|
git_management,
|
||||||
incremental_develop,
|
incremental_develop,
|
||||||
background_prompt):
|
background_prompt,
|
||||||
|
with_memory):
|
||||||
self.clear_structure = clear_structure # Whether to clear non-software files in the WareHouse and cache files in generated software path
|
self.clear_structure = clear_structure # Whether to clear non-software files in the WareHouse and cache files in generated software path
|
||||||
self.gui_design = gui_design # Encourage ChatDev generate software with GUI
|
self.gui_design = gui_design # Encourage ChatDev generate software with GUI
|
||||||
self.git_management = git_management # Whether to use git to manage the creation and changes of generated software
|
self.git_management = git_management # Whether to use git to manage the creation and changes of generated software
|
||||||
self.incremental_develop = incremental_develop # Whether to use incremental develop on an existing project
|
self.incremental_develop = incremental_develop # Whether to use incremental develop on an existing project
|
||||||
self.background_prompt = background_prompt # background prompt that will be added to every inquiry to LLM
|
self.background_prompt = background_prompt # background prompt that will be added to every inquiry to LLM
|
||||||
|
self.with_memory = with_memory # Wheter to use memroy in the interaction between agents
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
string = ""
|
string = ""
|
||||||
|
string += "ChatEnvConfig.with_memory: {}\n".format(self.with_memory)
|
||||||
string += "ChatEnvConfig.clear_structure: {}\n".format(self.clear_structure)
|
string += "ChatEnvConfig.clear_structure: {}\n".format(self.clear_structure)
|
||||||
string += "ChatEnvConfig.git_management: {}\n".format(self.git_management)
|
string += "ChatEnvConfig.git_management: {}\n".format(self.git_management)
|
||||||
string += "ChatEnvConfig.gui_design: {}\n".format(self.gui_design)
|
string += "ChatEnvConfig.gui_design: {}\n".format(self.gui_design)
|
||||||
@ -50,6 +54,7 @@ class ChatEnv:
|
|||||||
self.config = chat_env_config
|
self.config = chat_env_config
|
||||||
self.roster: Roster = Roster()
|
self.roster: Roster = Roster()
|
||||||
self.codes: Codes = Codes()
|
self.codes: Codes = Codes()
|
||||||
|
self.memory: Memory = Memory()
|
||||||
self.proposed_images: Dict[str, str] = {}
|
self.proposed_images: Dict[str, str] = {}
|
||||||
self.incorporated_images: Dict[str, str] = {}
|
self.incorporated_images: Dict[str, str] = {}
|
||||||
self.requirements: Documents = Documents()
|
self.requirements: Documents = Documents()
|
||||||
@ -92,6 +97,13 @@ class ChatEnv:
|
|||||||
else:
|
else:
|
||||||
os.mkdir(self.env_dict['directory'])
|
os.mkdir(self.env_dict['directory'])
|
||||||
|
|
||||||
|
def init_memory(self):
|
||||||
|
self.memory.id_enabled = True
|
||||||
|
self.memory.directory = os.path.join(os.getcwd(),"ecl","memory")
|
||||||
|
if not os.path.exists(self.memory.directory):
|
||||||
|
os.mkdir(self.memory.directory)
|
||||||
|
self.memory.upload()
|
||||||
|
|
||||||
def exist_bugs(self) -> tuple[bool, str]:
|
def exist_bugs(self) -> tuple[bool, str]:
|
||||||
directory = self.env_dict['directory']
|
directory = self.env_dict['directory']
|
||||||
|
|
||||||
|
@ -59,6 +59,7 @@ class Phase(ABC):
|
|||||||
need_reflect=False,
|
need_reflect=False,
|
||||||
with_task_specify=False,
|
with_task_specify=False,
|
||||||
model_type=ModelType.GPT_3_5_TURBO,
|
model_type=ModelType.GPT_3_5_TURBO,
|
||||||
|
memory=None,
|
||||||
placeholders=None,
|
placeholders=None,
|
||||||
chat_turn_limit=10
|
chat_turn_limit=10
|
||||||
) -> str:
|
) -> str:
|
||||||
@ -102,6 +103,7 @@ class Phase(ABC):
|
|||||||
task_prompt=task_prompt,
|
task_prompt=task_prompt,
|
||||||
task_type=task_type,
|
task_type=task_type,
|
||||||
with_task_specify=with_task_specify,
|
with_task_specify=with_task_specify,
|
||||||
|
memory=memory,
|
||||||
model_type=model_type,
|
model_type=model_type,
|
||||||
background_prompt=chat_env.config.background_prompt
|
background_prompt=chat_env.config.background_prompt
|
||||||
)
|
)
|
||||||
@ -227,6 +229,7 @@ class Phase(ABC):
|
|||||||
user_role_prompt=self.counselor_prompt,
|
user_role_prompt=self.counselor_prompt,
|
||||||
placeholders={"conversations": messages, "question": question},
|
placeholders={"conversations": messages, "question": question},
|
||||||
need_reflect=False,
|
need_reflect=False,
|
||||||
|
memory=chat_env.memory,
|
||||||
chat_turn_limit=1,
|
chat_turn_limit=1,
|
||||||
model_type=self.model_type)
|
model_type=self.model_type)
|
||||||
|
|
||||||
@ -300,6 +303,7 @@ class Phase(ABC):
|
|||||||
user_role_prompt=self.user_role_prompt,
|
user_role_prompt=self.user_role_prompt,
|
||||||
chat_turn_limit=chat_turn_limit,
|
chat_turn_limit=chat_turn_limit,
|
||||||
placeholders=self.phase_env,
|
placeholders=self.phase_env,
|
||||||
|
memory=chat_env.memory,
|
||||||
model_type=self.model_type)
|
model_type=self.model_type)
|
||||||
chat_env = self.update_chat_env(chat_env)
|
chat_env = self.update_chat_env(chat_env)
|
||||||
return chat_env
|
return chat_env
|
||||||
@ -529,6 +533,7 @@ class CodeReviewHuman(Phase):
|
|||||||
user_role_prompt=self.user_role_prompt,
|
user_role_prompt=self.user_role_prompt,
|
||||||
chat_turn_limit=chat_turn_limit,
|
chat_turn_limit=chat_turn_limit,
|
||||||
placeholders=self.phase_env,
|
placeholders=self.phase_env,
|
||||||
|
memory=chat_env.memory,
|
||||||
model_type=self.model_type)
|
model_type=self.model_type)
|
||||||
chat_env = self.update_chat_env(chat_env)
|
chat_env = self.update_chat_env(chat_env)
|
||||||
return chat_env
|
return chat_env
|
||||||
@ -579,6 +584,7 @@ class TestErrorSummary(Phase):
|
|||||||
phase_name=self.phase_name,
|
phase_name=self.phase_name,
|
||||||
assistant_role_prompt=self.assistant_role_prompt,
|
assistant_role_prompt=self.assistant_role_prompt,
|
||||||
user_role_prompt=self.user_role_prompt,
|
user_role_prompt=self.user_role_prompt,
|
||||||
|
memory=chat_env.memory,
|
||||||
chat_turn_limit=chat_turn_limit,
|
chat_turn_limit=chat_turn_limit,
|
||||||
placeholders=self.phase_env)
|
placeholders=self.phase_env)
|
||||||
chat_env = self.update_chat_env(chat_env)
|
chat_env = self.update_chat_env(chat_env)
|
||||||
|
163
ecl/codes.py
Normal file
163
ecl/codes.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
import difflib
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
import signal
|
||||||
|
from utils import get_easyDict_from_filepath
|
||||||
|
|
||||||
|
|
||||||
|
class Codes:
|
||||||
|
def __init__(self, generated_content=""):
|
||||||
|
cfg = get_easyDict_from_filepath("./ecl/config.yaml")
|
||||||
|
self.directory: str = cfg.codes.tmp_directory
|
||||||
|
self.main_script: str = cfg.codes.main_script
|
||||||
|
self.generated_content: str = generated_content
|
||||||
|
self.codebooks = {}
|
||||||
|
|
||||||
|
def extract_filename_from_line(lines):
|
||||||
|
file_name = ""
|
||||||
|
for candidate in re.finditer(r"(\w+\.\w+)", lines, re.DOTALL):
|
||||||
|
file_name = candidate.group()
|
||||||
|
file_name = file_name.lower()
|
||||||
|
return file_name
|
||||||
|
|
||||||
|
def extract_filename_from_code(code):
|
||||||
|
file_name = ""
|
||||||
|
regex_extract = r"class (\S+?):\n"
|
||||||
|
matches_extract = re.finditer(regex_extract, code, re.DOTALL)
|
||||||
|
for match_extract in matches_extract:
|
||||||
|
file_name = match_extract.group(1)
|
||||||
|
file_name = file_name.lower().split("(")[0] + ".py"
|
||||||
|
return file_name
|
||||||
|
|
||||||
|
if generated_content != "":
|
||||||
|
regex = r"(.+?)\n```.*?\n(.*?)```"
|
||||||
|
matches = re.finditer(regex, self.generated_content, re.DOTALL)
|
||||||
|
for match in matches:
|
||||||
|
code = match.group(2)
|
||||||
|
if "CODE" in code:
|
||||||
|
continue
|
||||||
|
group1 = match.group(1)
|
||||||
|
filename = extract_filename_from_line(group1)
|
||||||
|
if "__main__" in code:
|
||||||
|
filename = "main.py"
|
||||||
|
if filename == "": # post-processing
|
||||||
|
filename = extract_filename_from_code(code)
|
||||||
|
assert filename != ""
|
||||||
|
if filename is not None and code is not None and len(filename) > 0 and len(code) > 0:
|
||||||
|
self.codebooks[filename] = self._format_code(code)
|
||||||
|
|
||||||
|
def _format_code(self, code):
|
||||||
|
code = "\n".join([line for line in code.split("\n") if len(line.strip()) > 0])
|
||||||
|
return code
|
||||||
|
|
||||||
|
def _update_codes(self, generated_content):
|
||||||
|
new_codes = Codes(generated_content)
|
||||||
|
differ = difflib.Differ()
|
||||||
|
for key in new_codes.codebooks.keys():
|
||||||
|
if key not in self.codebooks.keys() or self.codebooks[key] != new_codes.codebooks[key]:
|
||||||
|
update_codes_content = "**[Update Codes]**\n\n"
|
||||||
|
update_codes_content += "{} updated.\n".format(key)
|
||||||
|
old_codes_content = self.codebooks[key] if key in self.codebooks.keys() else "# None"
|
||||||
|
new_codes_content = new_codes.codebooks[key]
|
||||||
|
|
||||||
|
lines_old = old_codes_content.splitlines()
|
||||||
|
lines_new = new_codes_content.splitlines()
|
||||||
|
|
||||||
|
unified_diff = difflib.unified_diff(lines_old, lines_new, lineterm='', fromfile='Old', tofile='New')
|
||||||
|
unified_diff = '\n'.join(unified_diff)
|
||||||
|
update_codes_content = update_codes_content + "\n\n" + """```
|
||||||
|
'''
|
||||||
|
|
||||||
|
'''\n""" + unified_diff + "\n```"
|
||||||
|
|
||||||
|
self.codebooks[key] = new_codes.codebooks[key]
|
||||||
|
|
||||||
|
def _rewrite_codes(self) -> None:
|
||||||
|
directory = self.directory
|
||||||
|
rewrite_codes_content = "**[Rewrite Codes]**\n"
|
||||||
|
if os.path.exists(directory):
|
||||||
|
shutil.rmtree(self.directory)
|
||||||
|
if not os.path.exists(directory):
|
||||||
|
os.mkdir(self.directory)
|
||||||
|
rewrite_codes_content += "{} Created\n".format(directory)
|
||||||
|
|
||||||
|
for filename in self.codebooks.keys():
|
||||||
|
filepath = os.path.join(directory, filename)
|
||||||
|
with open(filepath, "w", encoding="utf-8") as writer:
|
||||||
|
writer.write(self.codebooks[filename])
|
||||||
|
rewrite_codes_content += os.path.join(directory, filename) + " Wrote\n"
|
||||||
|
# print(rewrite_codes_content)
|
||||||
|
|
||||||
|
def _run_codes(self) -> None:
|
||||||
|
directory = os.path.abspath(self.directory)
|
||||||
|
if self.main_script not in os.listdir(directory):
|
||||||
|
return False, "{} Not Found".format(self.main_script)
|
||||||
|
|
||||||
|
success_info = "The software run successfully without errors."
|
||||||
|
|
||||||
|
try:
|
||||||
|
# check if we are on windows or linux
|
||||||
|
if os.name == 'nt':
|
||||||
|
command = "cd {} && dir && python {}".format(directory, self.main_script)
|
||||||
|
process = subprocess.Popen(
|
||||||
|
command,
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
command = "cd {}; ls -l; python3 {};".format(directory, self.main_script)
|
||||||
|
process = subprocess.Popen(command,
|
||||||
|
shell=True,
|
||||||
|
preexec_fn=os.setsid,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE
|
||||||
|
)
|
||||||
|
time.sleep(3)
|
||||||
|
return_code = process.returncode
|
||||||
|
# Check if the software is still running
|
||||||
|
if process.poll() is None:
|
||||||
|
if "killpg" in dir(os):
|
||||||
|
os.killpg(os.getpgid(process.pid), signal.SIGTERM)
|
||||||
|
else:
|
||||||
|
os.kill(process.pid, signal.SIGTERM)
|
||||||
|
if process.poll() is None:
|
||||||
|
os.kill(process.pid, signal.CTRL_BREAK_EVENT)
|
||||||
|
|
||||||
|
if return_code == 0:
|
||||||
|
return False, success_info
|
||||||
|
else:
|
||||||
|
error_output = process.stderr.read().decode('utf-8')
|
||||||
|
if error_output:
|
||||||
|
if "Traceback".lower() in error_output.lower():
|
||||||
|
errs = error_output.replace(directory + "/", "")
|
||||||
|
return True, errs
|
||||||
|
else:
|
||||||
|
return False, success_info
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
return True, f"Error: {e}"
|
||||||
|
except Exception as ex:
|
||||||
|
return True, f"An error occurred: {ex}"
|
||||||
|
|
||||||
|
return False, success_info
|
||||||
|
|
||||||
|
def _get_codes(self) -> str:
|
||||||
|
content = ""
|
||||||
|
for filename in self.codebooks.keys():
|
||||||
|
content += "{}\n```{}\n{}\n```\n\n".format(filename,
|
||||||
|
"python" if filename.endswith(".py") else filename.split(".")[
|
||||||
|
-1], self.codebooks[filename])
|
||||||
|
return content
|
||||||
|
|
||||||
|
def _load_from_hardware(self, directory) -> None:
|
||||||
|
assert len([filename for filename in os.listdir(directory) if filename.endswith(".py")]) > 0
|
||||||
|
for root, directories, filenames in os.walk(directory):
|
||||||
|
for filename in filenames:
|
||||||
|
if filename.endswith(".py"):
|
||||||
|
code = open(os.path.join(directory, filename), "r", encoding="utf-8").read()
|
||||||
|
self.codebooks[filename] = self._format_code(code)
|
||||||
|
print("{} files read from {}".format(len(self.codebooks.keys()), directory))
|
17
ecl/config.yaml
Normal file
17
ecl/config.yaml
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
experience:
|
||||||
|
reap_zombie: True
|
||||||
|
threshold: 0
|
||||||
|
upper_limit: 10
|
||||||
|
|
||||||
|
codes:
|
||||||
|
tmp_directory: "tmp_codes"
|
||||||
|
main_script: "main.py"
|
||||||
|
|
||||||
|
embedding_method: "OpenAI"
|
||||||
|
|
||||||
|
retrieval:
|
||||||
|
top_k_code: 1 # top k target code
|
||||||
|
top_k_text: 1 # top k instructionstar
|
||||||
|
|
||||||
|
searchcode_thresh: 0 # similarity threshold between text query and instructionstar, search for targetcode
|
||||||
|
searchtext_thresh: 0 # similarity threshold between code query and sourcecode, search for instructionstar
|
69
ecl/ecl.py
Normal file
69
ecl/ecl.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
|
||||||
|
import argparse
|
||||||
|
from graph import Graph
|
||||||
|
from experience import Experience
|
||||||
|
from utils import get_easyDict_from_filepath,now ,log_and_print_online
|
||||||
|
from memory import Memory
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
sys.path.append(os.path.join(os.getcwd(),"ecl"))
|
||||||
|
|
||||||
|
|
||||||
|
def memorize(directory):
|
||||||
|
print(directory)
|
||||||
|
cfg = get_easyDict_from_filepath("./ecl/config.yaml")
|
||||||
|
|
||||||
|
folder_path = "ecl/logs"
|
||||||
|
if not os.path.exists(folder_path):
|
||||||
|
os.mkdir(folder_path)
|
||||||
|
log_filename = folder_path+"/ecl_{}.log".format(os.path.basename(directory))
|
||||||
|
print(log_filename)
|
||||||
|
root_logger = logging.getLogger()
|
||||||
|
for handler in root_logger.handlers[:]:
|
||||||
|
root_logger.removeHandler(handler)
|
||||||
|
file_handler = logging.FileHandler(log_filename, mode='w', encoding='utf-8')
|
||||||
|
formatter = logging.Formatter('[%(asctime)s %(levelname)s] %(message)s', datefmt='%Y-%d-%m %H:%M:%S')
|
||||||
|
file_handler.setFormatter(formatter)
|
||||||
|
root_logger.addHandler(file_handler)
|
||||||
|
root_logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
log_and_print_online("[Config]:"+str(cfg))
|
||||||
|
graph = Graph()
|
||||||
|
graph.create_from_log(directory)
|
||||||
|
graph.print()
|
||||||
|
|
||||||
|
experience = Experience(graph, directory)
|
||||||
|
if len(graph.nodes)==0 or len(graph.edges) == 0:
|
||||||
|
log_and_print_online("No node or no edges constrcuted from the task execution process, maybe due to a unfinished software production or sometimes single node appears")
|
||||||
|
else:
|
||||||
|
if cfg.experience.reap_zombie:
|
||||||
|
experience.reap_zombie()
|
||||||
|
graph.print()
|
||||||
|
experience.estimate()
|
||||||
|
experiences = experience.extract_thresholded_experiences()
|
||||||
|
|
||||||
|
# memory upload
|
||||||
|
memory = Memory()
|
||||||
|
memory.upload()
|
||||||
|
memory.upload_from_experience(experience)
|
||||||
|
|
||||||
|
def process_directory(directory):
|
||||||
|
for root, dirs, files in os.walk(directory):
|
||||||
|
for directory in dirs:
|
||||||
|
file_path = os.path.join(root, directory)
|
||||||
|
memorize(file_path)
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description="Memorize one software or softwares from the directory.")
|
||||||
|
parser.add_argument("path", help="The file or directory to process")
|
||||||
|
parser.add_argument("-d", "--directory", action="store_true", help="Process all files in the given directory.")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.directory:
|
||||||
|
process_directory(args.path)
|
||||||
|
else:
|
||||||
|
memorize(args.path)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
84
ecl/embedding.py
Normal file
84
ecl/embedding.py
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
import os
|
||||||
|
import openai
|
||||||
|
from openai import OpenAI
|
||||||
|
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
|
||||||
|
if 'BASE_URL' in os.environ:
|
||||||
|
BASE_URL = os.environ['BASE_URL']
|
||||||
|
else:
|
||||||
|
BASE_URL = None
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from tenacity import (
|
||||||
|
retry,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_random_exponential,
|
||||||
|
wait_fixed
|
||||||
|
)
|
||||||
|
from utils import log_and_print_online
|
||||||
|
sys.path.append(os.path.join(os.getcwd(),"ecl"))
|
||||||
|
|
||||||
|
class OpenAIEmbedding:
    """Wrapper around the OpenAI embeddings endpoint.

    Tracks token usage separately for text and code requests, plus combined
    totals, so callers can report API consumption.
    """

    def __init__(self, **params):
        # Per-kind usage counters.
        self.code_prompt_tokens = 0
        self.text_prompt_tokens = 0
        self.code_total_tokens = 0
        self.text_total_tokens = 0

        # Combined usage across both request kinds.
        self.prompt_tokens = 0
        self.total_tokens = 0

    @staticmethod
    def _make_client():
        """Build an OpenAI client, honoring the optional BASE_URL override.

        Extracted helper: the original duplicated this construction in both
        embedding methods.
        """
        if BASE_URL:
            return openai.OpenAI(api_key=OPENAI_API_KEY, base_url=BASE_URL)
        return openai.OpenAI(api_key=OPENAI_API_KEY)

    @retry(wait=wait_random_exponential(min=2, max=5), stop=stop_after_attempt(10))
    def get_text_embedding(self, text: str):
        """Embed *text* with text-embedding-ada-002 and record token usage.

        Returns:
            The embedding vector (list of floats) from the API response.
        """
        client = self._make_client()
        # NOTE(review): 8191 is the model's *token* limit, but this truncates
        # characters — kept as-is to preserve behavior; confirm intent.
        if len(text) > 8191:
            text = text[:8190]
        response = client.embeddings.create(input=text, model="text-embedding-ada-002").model_dump()
        embedding = response['data'][0]['embedding']
        log_and_print_online(
            "Get text embedding from {}:\n**[OpenAI_Usage_Info Receive]**\nprompt_tokens: {}\ntotal_tokens: {}\n".format(
                response["model"], response["usage"]["prompt_tokens"], response["usage"]["total_tokens"]))
        self.text_prompt_tokens += response["usage"]["prompt_tokens"]
        self.text_total_tokens += response["usage"]["total_tokens"]
        self.prompt_tokens += response["usage"]["prompt_tokens"]
        self.total_tokens += response["usage"]["total_tokens"]

        return embedding

    @retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(10))
    def get_code_embedding(self, code: str):
        """Embed *code* with text-embedding-ada-002 and record token usage.

        Empty code is replaced by "#" because the API rejects empty input.

        Returns:
            The embedding vector (list of floats) from the API response.
        """
        client = self._make_client()
        if len(code) == 0:
            code = "#"
        elif len(code) > 8191:
            # NOTE(review): character-based truncation against a token limit,
            # kept identical to the original behavior.
            code = code[0:8190]
        response = client.embeddings.create(input=code, model="text-embedding-ada-002").model_dump()
        embedding = response['data'][0]['embedding']
        log_and_print_online(
            "Get code embedding from {}:\n**[OpenAI_Usage_Info Receive]**\nprompt_tokens: {}\ntotal_tokens: {}\n".format(
                response["model"], response["usage"]["prompt_tokens"], response["usage"]["total_tokens"]))

        self.code_prompt_tokens += response["usage"]["prompt_tokens"]
        self.code_total_tokens += response["usage"]["total_tokens"]
        self.prompt_tokens += response["usage"]["prompt_tokens"]
        self.total_tokens += response["usage"]["total_tokens"]

        return embedding
|
||||||
|
|
||||||
|
|
311
ecl/experience.py
Normal file
311
ecl/experience.py
Normal file
@ -0,0 +1,311 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
from graph import Graph, Node, Edge
|
||||||
|
import sys
|
||||||
|
import openai
|
||||||
|
import numpy as np
|
||||||
|
from codes import Codes
|
||||||
|
from utils import get_easyDict_from_filepath,OpenAIModel,log_and_print_online
|
||||||
|
from embedding import OpenAIEmbedding
|
||||||
|
sys.path.append(os.path.join(os.getcwd(),"ecl"))
|
||||||
|
class Shortcut:
    """A mined experience: a value-gaining jump between two code versions."""

    def __init__(self, sourceMID, targetMID, valueGain, instructionStar, edgeIDPath):
        self.sourceMID = sourceMID              # id of the source code version
        self.targetMID = targetMID              # id of the target code version
        self.valueGain = valueGain              # value(target) - value(source)
        self.embedding = None                   # filled in later by the memory layer
        self.instructionStar = instructionStar  # generated transition instruction
        self.edgeIDPath = edgeIDPath            # edge ids along the original path

    def __str__(self):
        preview = self.instructionStar[:100].replace("\n", "")
        return "{} -> {} valueGain={:.6f} len(instructionPath)={} instructionStar={}".format(
            self.sourceMID, self.targetMID, self.valueGain, len(self.edgeIDPath), preview)
|
||||||
|
|
||||||
|
class Experience:
|
||||||
|
    def __init__(self, graph: Graph, directory: str):
        """Prepare experience mining over *graph* built from warehouse *directory*.

        Loads thresholds from ./ecl/config.yaml, wires up the LLM and the
        embedding backend, then initializes per-node in-degrees and values.
        """
        cfg = get_easyDict_from_filepath("./ecl/config.yaml")
        self.graph: Graph = graph
        self.directory = directory
        # Minimum valueGain for a shortcut to be kept, and max shortcuts kept.
        self.threshold = cfg.experience.threshold
        self.upperLimit = cfg.experience.upper_limit
        self.experiences = []

        self.model = OpenAIModel(model_type="gpt-3.5-turbo-16k")
        self.embedding_method = OpenAIEmbedding()

        # Count in-degrees: each edge contributes to its target node.
        for edge in self.graph.edges:
            node = self.graph.nodes[edge.targetMID]
            node.degree += 1
        assert len(self.graph.edges) * 1 == sum([self.graph.nodes[mid].degree for mid in self.graph.nodes.keys()]) # unidirectional

        # Every node starts with a neutral value of 1.0; estimate() refines it.
        for mid in self.graph.nodes.keys():
            node = self.graph.nodes[mid]
            node.value = 1.0
|
||||||
|
|
||||||
|
    def reap_zombie(self):
        """Remove nodes and edges that lie off the main shortest path.

        Anything not on the shortest path from the graph's start to its end
        is considered a dead-end ("zombie") and is deleted; removals are
        logged for traceability.
        """
        pathNodes, pathEdges = self.graph.find_shortest_path()

        # Everything off the main path is a zombie.
        zombieEdges = [edge for edge in self.graph.edges if edge not in pathEdges]
        zombieNodes = [self.graph.nodes[mid] for mid in self.graph.nodes.keys() if mid not in pathNodes]
        log_zombieedges = "ZOMBIE EDGES: \n"
        log_zombienodes = "ZOMBIE NODES: \n"
        for edge in zombieEdges:
            self.graph.edges.remove(edge)
            log_zombieedges += "Zombie Edge {} -> {} Removed\n".format(edge.sourceMID, edge.targetMID)
        log_and_print_online(log_zombieedges)

        for node in zombieNodes:
            del self.graph.nodes[node.mID]
            log_zombienodes += "Zombie Node {} Removed\n".format(node.mID)
        log_and_print_online(log_zombienodes)
|
||||||
|
|
||||||
|
def estimate(self):
|
||||||
|
if len(self.graph.edges) == 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
for mid in self.graph.nodes.keys():
|
||||||
|
node = self.graph.nodes[mid]
|
||||||
|
if len(node.code) == 0:
|
||||||
|
node.value *= 0.0
|
||||||
|
|
||||||
|
log_and_print_online()
|
||||||
|
|
||||||
|
vn = self.graph.nodes[self.graph.edges[-1].targetMID]
|
||||||
|
# print(vn.mID, "...")
|
||||||
|
|
||||||
|
for mid in self.graph.nodes.keys():
|
||||||
|
# print(mid)
|
||||||
|
vi = self.graph.nodes[mid]
|
||||||
|
vi.value = self._pairwise_estimate(vi, vn)
|
||||||
|
|
||||||
|
log_and_print_online("Init value:"+ str({mid: self.graph.nodes[mid].value for mid in self.graph.nodes.keys()})+"\n\nEstimated value:"+str({mid: self.graph.nodes[mid].value for mid in self.graph.nodes.keys()}))
|
||||||
|
|
||||||
|
def get_cosine_similarity(self, embeddingi, embeddingj):
|
||||||
|
embeddingi = np.array(embeddingi)
|
||||||
|
embeddingj = np.array(embeddingj)
|
||||||
|
cos_sim = embeddingi.dot(embeddingj) / (np.linalg.norm(embeddingi) * np.linalg.norm(embeddingj))
|
||||||
|
return cos_sim
|
||||||
|
|
||||||
|
    def _pairwise_estimate(self, vi: Node, vj: Node):
        """Score node *vi* against target node *vj*.

        Combines several gates and weights; any zero gate short-circuits to 0:
        - vi's current value (already-zeroed nodes stay 0),
        - whether vi's code compiles/runs (via Codes),
        - vi's normalized in-degree,
        - code-code and code-task cosine similarities,
        - the version distance between vi and vj.
        Embeddings are cached on the nodes / graph after first computation.
        """
        if vi.value == 0.0:
            return 0.0

        pathNodes, pathEdges = self.graph.find_shortest_path(vi.mID, vj.mID)
        # Shorter paths weigh more; a zero-length path counts as weight 1.
        distance_weight = 1.0 / len(pathEdges) if len(pathEdges) != 0 else 1.0

        # Materialize and run vi's code; a buggy version is worthless.
        codes = Codes(vi.code)
        codes._rewrite_codes()
        (exist_bugs_flag, test_reports) = codes._run_codes()
        compile_weight = 0.0 if exist_bugs_flag else 1.0

        if compile_weight == 0.0:
            return 0.0

        maximum_degree = max([self.graph.nodes[mid].degree for mid in self.graph.nodes.keys()])
        degree_weight = vi.degree * 1.0 / maximum_degree

        if degree_weight == 0.0:
            return 0.0

        # Code embedding of vi (cached on the node after first computation).
        start_time = time.time()
        vi_code_emb = self.embedding_method.get_code_embedding(vi.code) if vi.embedding is None else vi.embedding
        if vi.embedding is None:
            end_time =time.time()
            log_and_print_online("DONE:get node embedding\ntime cost:{}\n".format(end_time-start_time))
            vi.embedding = vi_code_emb

        # Code embedding of vj (cached likewise).
        start_time = time.time()
        vj_code_emb = self.embedding_method.get_code_embedding(vj.code) if vj.embedding is None else vj.embedding
        if vj.embedding is None:
            end_time =time.time()
            log_and_print_online("DONE:get node embedding\ntime cost:{}\n".format(end_time-start_time))
            vj.embedding = vj_code_emb
        code_code_cos_sim = self.get_cosine_similarity(vi_code_emb, vj_code_emb)

        if code_code_cos_sim == 0.0:
            return 0.0

        # Task prompt embedding, read from the first .prompt file in the
        # warehouse directory and cached on the graph.
        filenames = os.listdir(self.directory)
        filename = [filename for filename in filenames if filename.endswith(".prompt")][0]
        task_prompt = open(os.path.join(self.directory, filename), "r").read().strip()
        start_time = time.time()
        task_emb = self.embedding_method.get_text_embedding(task_prompt) if self.graph.task_embedding is None else self.graph.task_embedding
        if self.graph.task_embedding is None:
            end_time =time.time()
            log_and_print_online("DONE:get task prompt embedding\ntime cost:{}\n".format(end_time-start_time))
            self.graph.task = task_prompt
            self.graph.task_embedding = task_emb
        code_text_cos_sim = self.get_cosine_similarity(vi_code_emb, task_emb)

        if code_text_cos_sim == 0.0:
            return 0.0

        assert distance_weight >= 0.0 and distance_weight <= 1.0
        assert compile_weight >= 0.0 and compile_weight <= 1.0
        assert degree_weight >= 0.0 and degree_weight <= 1.0

        # Version distance between target and source.
        distance = vj.version - vi.version

        if distance == 0:
            return 1
        else:
            return code_code_cos_sim * 1.0 / distance * code_text_cos_sim * compile_weight * degree_weight
        #return distance_weight * compile_weight * degree_weight
|
||||||
|
|
||||||
|
    def get_transitive_closure(self):
        """Compute reachability over the main shortest path (Warshall).

        Builds an adjacency matrix restricted to the shortest-path edges,
        then closes it transitively. Returns a dict-of-dicts matrix where
        matrix[a][b] == 1 means b is reachable from a.
        """
        def print_matrix(matrix):
            # Debug helper: dump the matrix row by row to stdout.
            for nodei in matrix.keys():
                for nodej in matrix.keys():
                    print(matrix[nodei][nodej], end=" ")
                print()
            print()

        # Warshall Algorithm
        matrix = {}
        for mid1 in self.graph.nodes:
            for mid2 in self.graph.nodes:
                if mid1 not in matrix.keys():
                    matrix[mid1] = {}
                matrix[mid1][mid2] = 0
        # print_matrix(matrix)

        # Seed the matrix with direct edges from the main shortest path only.
        pathNodes, pathEdges = self.graph.find_shortest_path()
        for edge in pathEdges:
            matrix[edge.sourceMID][edge.targetMID] = 1
        print("Init Adjacent Matrix:")
        print_matrix(matrix)

        # Transitive closure: i->j if i->j already, or i->k and k->j.
        for nodek in matrix.keys():
            for nodei in matrix.keys():
                for nodej in matrix.keys():
                    if matrix[nodei][nodej] == 1 or (matrix[nodei][nodek] == 1 and matrix[nodek][nodej] == 1):
                        matrix[nodei][nodej] = 1
        print("Transitive Closure:")
        print_matrix(matrix)

        return matrix
|
||||||
|
|
||||||
|
    def extract_thresholded_experiences(self):
        """Mine Shortcut experiences whose value gain clears the threshold.

        Considers every ordered pair of nodes on the main shortest path that
        is reachable (per the transitive closure), not directly connected,
        and gains at least ``self.threshold`` in value; generates an
        "instructionStar" for each kept pair via the LLM. Results are sorted
        by gain, truncated to ``self.upperLimit``, stored on
        ``self.experiences`` and returned.
        """
        if len(self.graph.edges) == 0:
            return []
        if len(self.graph.nodes) < 2:
            return []
        assert len(self.graph.nodes.keys()) >= 2
        matrix = self.get_transitive_closure()

        experiences = []
        pathNodes, _ = self.graph.find_shortest_path()
        for id1 in pathNodes:
            for id2 in pathNodes:
                valueGain = self.graph.nodes[id2].value - self.graph.nodes[id1].value
                # Candidate filters: distinct nodes, no direct edge,
                # reachable, above-threshold gain.
                flag0 = id1 != id2
                flag1 = self.graph.exists_edge(id1, id2) == False
                flag2 = matrix[id1][id2] == 1
                flag3 = valueGain >= self.threshold

                # Reject targets containing bare "pass"/"TODO" lines
                # (placeholder code).
                code_lines = [line.lower().strip() for line in self.graph.nodes[id2].code.split("\n")]
                flag4 = not ("pass".lower() in code_lines or "TODO".lower() in code_lines)

                if flag0 and flag1 and flag2 and flag3 and flag4:
                    _, edges = self.graph.find_shortest_path(uMID=id1, vMID=id2)
                    edgeIDPath = [edge.edgeId for edge in edges]
                    sourcecode=self.graph.nodes[id1].code
                    targetcode=self.graph.nodes[id2].code
                    shortcut = Shortcut(sourceMID=id1, targetMID=id2, valueGain=valueGain,instructionStar="", edgeIDPath=edgeIDPath)
                    experiences.append(shortcut)

        # Keep the best gains first, capped at the configured upper limit.
        experiences = sorted(experiences, key=lambda item: item.valueGain, reverse = True)

        if len(experiences) > self.upperLimit:
            log_and_print_online("{} experieces truncated.".format(len(experiences) - self.upperLimit))
            experiences = experiences[:self.upperLimit]

        # Template used when the source version is empty (generate from scratch).
        prompt_template0 = """Provide detailed instructions to generate the following code:
{targetcode}

The instructions should encompass:

Modules and Classes:
- Enumerate necessary modules.
- Detail the classes, their attributes, and methods within these modules.
- Articulate the purpose and operation of each class.

Data Structures:
- Identify the requisite data structures.
- Describe their names, attributes, and operations.

Main Program Flow:
- Outline the principal progression of the program.
- Highlight the sequence for initializing and invoking other modules, classes, and methods within the primary file (e.g., main.py).
- Clarify the logical progression during runtime.

Input and Output:
- Specify the method by which the program accepts input, be it from users or external sources.
- Elaborate on the projected outputs or actions of the software.

Exception Handling:
- Instruct on the approach to manage potential anomalies or exceptions during execution to ascertain stability and robustness.

External Libraries and Dependencies:
- Explicitly list the necessary external libraries or dependencies, their versions, and their functionalities.

Please output the instructions directly."""

        # Template used for source -> target transitions.
        prompt_template1 = """Please provide detailed instructions on how to transition from the initial code version represented by source code to the final version indicated by target code.

Source Code:
{sourcecode}

Target Code:
{targetcode}

The instructions should encompass:

Modules and Classes: Detail the modules to be incorporated, along with the names, attributes, and operations of any classes to be added or amended. Furthermore, describe the intended function and utility of these new or altered classes.

Data Structures: Clearly define any data structures that need introduction or alteration, elucidating their names, attributes, and functionalities.

Main Program Flow: Outline the program's primary sequence of operations, highlighting the procedures to initialize and invoke other modules, classes, and methods in the primary file (e.g., main.py). Describe the program's logic sequence during its execution.

Input and Output: Define the methodology by which the program will acquire input, whether from users or external data sources. Also, characterize the projected outputs or behaviors of the application.

Exception Handling: Provide guidance on managing potential discrepancies or exceptions that might emerge during the software's operation, ensuring its resilience and reliability.

External Libraries and Dependencies: If the implementation requires external libraries or dependencies, specify their names, versions, and their respective purposes explicitly."""

        # Ask the LLM for the instruction of each kept shortcut.
        for shortcut in experiences:
            sourcecode = self.graph.nodes[shortcut.sourceMID].code
            targetcode = self.graph.nodes[shortcut.targetMID].code
            if sourcecode == "":
                prompt = prompt_template0.replace("{targetcode}", targetcode)
                response = self.model.run(messages=[{"role": "system", "content": prompt}])
                print("instructionstar generated")
            else:
                prompt = prompt_template1.replace("{sourcecode}", sourcecode).replace("{targetcode}", targetcode)
                response = self.model.run(messages=[{"role": "system", "content": prompt}])
                print("instructionstar generated")
            shortcut.instructionStar = response["choices"][0]["message"]["content"]
        output = "Sorted-and-Truncated Experiences (with instructionStar):"

        self.experiences = experiences
        for experience in experiences:
            output += str(experience)
        log_and_print_online(output)
        # Usage accounting for both the chat model and the embedding backend.
        log_and_print_online("[Conclusion]:\nprompt_tokens:{}, completion_tokens:{}, total_tokens:{}".format(self.model.prompt_tokens,self.model.completion_tokens,self.model.total_tokens))
        log_and_print_online("[Conclusion]:\ntext_prompt_tokens:{}, text_total_tokens:{}\ncode_prompt_tokens:{}, code_total_tokens:{}\nprompt_tokens:{}, total_tokens:{}".format(self.embedding_method.text_prompt_tokens,
                                                                                                                                                                                 self.embedding_method.text_total_tokens,
                                                                                                                                                                                 self.embedding_method.code_prompt_tokens,
                                                                                                                                                                                 self.embedding_method.code_total_tokens,
                                                                                                                                                                                 self.embedding_method.prompt_tokens,
                                                                                                                                                                                 self.embedding_method.total_tokens))

        return experiences
|
||||||
|
def to_dict(self):
|
||||||
|
merged_data = []
|
||||||
|
for index, ex in enumerate(self.experiences):
|
||||||
|
merged_data.append(ex.__dict__)
|
||||||
|
return merged_data
|
327
ecl/graph.py
Normal file
327
ecl/graph.py
Normal file
@ -0,0 +1,327 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import hashlib
|
||||||
|
from queue import Queue
|
||||||
|
import re
|
||||||
|
from utils import cmd,log_and_print_online
|
||||||
|
|
||||||
|
class Node:
    """One code version (a commit snapshot) in the version graph."""

    def __init__(self):
        self.code = None            # concatenated, fenced source of all .py files
        self.version = None         # numeric version parsed from the commit message
        self.commitMessage = None
        self.mID = None             # md5 of self.code, the node's identity
        self.role = None
        self.degree = 0             # in-degree, filled by Experience.__init__
        self.value = 0.0            # estimated value, filled by Experience
        self.embedding = None       # cached code embedding

    def create_from_warehouse(self, directory) -> None:
        """Populate this node from the current git checkout of *directory*.

        Reads all .py files, concatenates them into a fenced-code string,
        derives mID from its md5, and parses commit message / version from
        `git log --oneline`.
        """
        def _format_code(code):
            # Drop blank lines so formatting noise does not change the md5.
            code = "\n".join([line for line in code.split("\n") if len(line.strip()) > 0])
            return code

        # Read all .py files
        codebooks = {}
        assert len([filename for filename in os.listdir(directory) if filename.endswith(".py")]) > 0
        for root, directories, filenames in os.walk(directory):
            for filename in filenames:
                if filename.endswith(".py"):
                    # NOTE(review): opens relative to *directory*, not *root* —
                    # .py files in sub-directories would fail to open; confirm
                    # warehouses are flat.
                    codebooks[filename] = _format_code(open(os.path.join(directory, filename), "r", encoding="utf-8").read())

        # Format Codes
        code = ""
        for filename in codebooks.keys():
            filepath = os.path.join(directory, filename)
            code += "{}\n```Python\n{}\n```\n\n".format(filename, codebooks[filename])

        self.code = code
        self.mID = hashlib.md5(self.code.encode(encoding='UTF-8')).hexdigest()

        # Latest commit line: "<hash> v<version> <message...>".
        content = cmd("cd {} && git log --oneline".format(directory)).replace("(HEAD -> main)", "").replace("  ", " ")
        self.commitMessage = " ".join(content.split("\n")[0].split(" ")[1:])
        self.version = float(content.split("\n")[0].split(" ")[1].replace("v", ""))
|
||||||
|
|
||||||
|
class Edge:
    """A directed transition between two code versions in the graph.

    Carries the natural-language instruction and the speaking role that
    produced the transition; ``edgeId`` is assigned when the edge is added
    to a Graph.
    """

    def __init__(self, sourceMID, targetMID, instruction, role):
        self.sourceMID = sourceMID
        self.targetMID = targetMID
        self.instruction = instruction
        self.role = role
        self.edgeId = None      # set by Graph.addEdge
        self.embedding = None   # instruction embedding, filled lazily
|
||||||
|
|
||||||
|
class Graph:
|
||||||
|
    def __init__(self):
        """Create an empty version graph."""
        self.task = ""              # task prompt text (filled lazily)
        self.task_embedding = None  # cached task prompt embedding
        self.nodes = {}             # mID -> Node
        self.edges = []             # ordered list of Edge objects
        self.directory:str = None   # warehouse directory this graph came from
|
||||||
|
|
||||||
|
def addNode(self, node: Node):
|
||||||
|
if node.mID not in self.nodes.keys():
|
||||||
|
self.nodes[node.mID] = node
|
||||||
|
|
||||||
|
def addEdge(self, edge: Edge):
|
||||||
|
num = "edge_{}".format(len(self.edges))
|
||||||
|
edge.edgeId = hashlib.md5(num.encode(encoding='UTF-8')).hexdigest()
|
||||||
|
self.edges.append(edge)
|
||||||
|
|
||||||
|
def exists_edge(self, mid1: str, mid2: str):
|
||||||
|
for edge in self.edges:
|
||||||
|
if edge.sourceMID == mid1 and edge.targetMID == mid2:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
    def create_from_warehouse(self, directory) -> None:
        """Build the graph from the git history of warehouse *directory*.

        Walks the commit history oldest-to-newest, checking out each commit
        (git reset --hard) to build one Node per commit, then connects
        consecutive commits with Edges and fills instructions/roles from the
        chat log. The working tree is restored to the latest commit even if
        node construction fails.
        """
        self.directory = directory
        content = cmd("cd {} && git log --oneline".format(directory))
        #assert "log commit" in content
        # Oldest-first commit ids, with a synthetic all-zeros id for the
        # empty pre-history state; the newest commit is kept as log_cID.
        cIDs = ["0" * 7] + [line.split(" ")[0] for line in content.split("\n") if len(line)>0][::-1] # Commit IDs
        log_cID = cIDs[-1]
        cIDs = cIDs[:-1]
        log_and_print_online("commit history:"+ str(cIDs)+ "\nlog commit:"+ str(log_cID))

        # Commit ID -> md5 ID
        # Constructing Nodes
        try:
            cID2mID = {}
            output = ""
            for cID in cIDs:
                if cID == "0" * 7:
                    # Synthetic empty root node.
                    node = Node()
                    node.code = ""
                    node.mID = hashlib.md5("".encode(encoding='UTF-8')).hexdigest()
                    node.commitMessage = ""
                    node.version = "v0.0"
                    cID2mID[cID] = node.mID
                    self.addNode(node)
                    output += ("Node: {} -> {}\n".format("0" * 7, node.mID))
                else:
                    # Check out this commit and snapshot its code.
                    content = cmd("cd {} && git reset --hard {}".format(directory, cID))
                    node = Node()
                    node.create_from_warehouse(directory)
                    cID2mID[cID] = node.mID
                    self.addNode(node)
                    output += ("Node: {} -> {}\n".format(cID, node.mID))
        finally:
            # Always restore the working tree to the newest commit.
            cmd("cd {} && git reset --hard {}".format(directory, log_cID))
        log_and_print_online(output)
        # Constructing Edges
        for i in range(1, len(cIDs), 1):
            sourceCID = cIDs[i-1]
            targetCID = cIDs[i]
            sourceMID = cID2mID[sourceCID]
            targetMID = cID2mID[targetCID]
            edge = Edge(sourceMID, targetMID, instruction="", role="")
            self.addEdge(edge)
            # print("{} -> {}, {} -> {}".format(sourcecID, targetcID, sourcemID, targetmID))
        self._create_instruction_and_roles_from_log(directory)
|
||||||
|
|
||||||
|
    def create_from_log(self, directory) -> None:
        """Build the graph by replaying the ChatDev chat log in *directory*.

        Parses the first .log file, extracts every code-bearing Programmer
        utterance, incrementally rebuilds the per-file codebook after each
        one, and creates a Node per distinct code state plus Edges between
        consecutive states. Returns silently when no .log file exists.
        """
        def update_codebook(utterance, codebook):
            # Merge all fenced code blocks in *utterance* into *codebook*
            # (filename -> formatted code).
            def extract_filename_from_line(lines):
                # Last "name.ext"-looking token on the header line, lowercased.
                file_name = ""
                for candidate in re.finditer(r"(\w+\.\w+)", lines, re.DOTALL):
                    file_name = candidate.group()
                    file_name = file_name.lower()
                return file_name

            def extract_filename_from_code(code):
                # Fallback: derive "<classname>.py" from the last class def.
                file_name = ""
                regex_extract = r"class (\S+?):\n"
                matches_extract = re.finditer(regex_extract, code, re.DOTALL)
                for match_extract in matches_extract:
                    file_name = match_extract.group(1)
                file_name = file_name.lower().split("(")[0] + ".py"
                return file_name

            def _format_code(code):
                code = "\n".join([line for line in code.split("\n") if len(line.strip()) > 0])
                return code

            regex = r"(.+?)\n```.*?\n(.*?)```"
            matches = re.finditer(regex, utterance, re.DOTALL)
            for match in matches:
                code = match.group(2)
                # Skip placeholder blocks that still contain the CODE marker.
                if "CODE" in code:
                    continue
                group1 = match.group(1)
                filename = extract_filename_from_line(group1)
                if "__main__" in code:
                    filename = "main.py"
                if filename == "":
                    filename = extract_filename_from_code(code)
                assert filename != ""
                if filename is not None and code is not None and len(filename) > 0 and len(code) > 0:
                    codebook[filename] = _format_code(code)

        def get_codes(codebook):
            # Render the codebook into one fenced-code string (Node.code format).
            content = ""
            for filename in codebook.keys():
                content += "{}\n```{}\n{}\n```\n\n".format(filename, "python" if filename.endswith(".py") else
                filename.split(".")[-1], codebook[filename])
            return content

        self.directory = directory
        logdir = [filename for filename in os.listdir(directory) if filename.endswith(".log")]
        if len(logdir) > 0:
            log_filename = logdir[0]
            print("log_filename:", log_filename)
        else:
            return
        content = open(os.path.join(directory, log_filename), "r", encoding='UTF-8').read()

        # Split the log into timestamped utterances.
        utterances = []
        regex = r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \w+)\] ([.\s\S\n\r\d\D\t]*?)(?=\n\[\d|$)"
        matches = re.finditer(regex, content, re.DOTALL)
        for match in matches:
            group1 = match.group(1)
            group2 = match.group(2)
            utterances.append("[{}] {}".format(group1, group2))
        # Drop noise and everything from the EnvironmentDoc phase onward.
        utterances = [utterance for utterance in utterances if
                      "flask app.py" not in utterance and "OpenAI_Usage_Info" not in utterance]
        index = [i for i, utterance in enumerate(utterances) if
                 "Programmer<->Chief Technology Officer on : EnvironmentDoc" in utterance]
        if len(index) > 0:
            utterances = utterances[:index[0] - 1]

        # Keep only code-bearing Programmer utterances.
        utterances_code= [utterance for utterance in utterances if
                          "Programmer<->" in utterance and "EnvironmentDoc" not in utterance and "TestErrorSummary" not in utterance]
        print("len(utterances_code):", len(utterances_code))

        codebook, fingerprints, pre_mid = {}, set(), ""
        for utterance in utterances_code:
            update_codebook(utterance, codebook)

            # construct node
            node = Node()
            node.mID = hashlib.md5(get_codes(codebook).encode(encoding='UTF-8')).hexdigest()
            node.commitMessage = ""
            node.code = get_codes(codebook)
            node.version = float(len(fingerprints))
            if node.mID not in fingerprints:
                fingerprints.add(node.mID)
                self.addNode(node)

                # construct edge
                if pre_mid != "":
                    sourceMID = pre_mid
                    targetMID = node.mID
                    edge = Edge(sourceMID, targetMID, instruction="", role="")
                    self.addEdge(edge)
                pre_mid = node.mID

        self._create_instruction_and_roles_from_log(directory)
|
||||||
|
|
||||||
|
    def _create_instruction_and_roles_from_log(self, directory) -> None:
        """Fill each edge's instruction and role from the chat log.

        Extracts the CTO / Code Reviewer / Software Test Engineer "Start
        Chat" utterances from the first .log file and assigns the i-th
        extracted instruction/role to the i-th edge (positional pairing).
        Returns silently when no .log file exists.
        """
        logdir = [filename for filename in os.listdir(directory) if filename.endswith(".log")]
        if len(logdir)>0:
            log_filename = logdir[0]
            log_and_print_online("log_filename:"+log_filename)
        else :
            return
        content = open(os.path.join(directory, log_filename), "r", encoding='UTF-8').read()

        # Split the log into timestamped utterance bodies.
        utterances = []
        regex = r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \w+)\] ([.\s\S\n\r\d\D\t]*?)(?=\n\[\d|$)"
        matches = re.finditer(regex, content, re.DOTALL)
        for match in matches:
            group1 = match.group(1)
            group2 = match.group(2)
            # print(group1)
            # print(group2)
            utterances.append(group2)
            # print()
        utterances = [utterance for utterance in utterances if "Chief Technology Officer: **[Start Chat]**" in utterance or "Code Reviewer: **[Start Chat]**" in utterance or "Software Test Engineer: **[Start Chat]**" in utterance]
        # A passing test run appends a synthetic final utterance.
        if "Test Pass!" in content:
            utterances.append("Software Test Engineer: **[Start Chat]**\n\nTest Pass!")

        instructions, roles = [], []
        for utterance in utterances:
            utterance = utterance.lower()
            instruction = ""
            if "Chief Technology Officer: **[Start Chat]**".lower() in utterance:
                # CTO turns always carry the same fixed coding instruction.
                instruction = "write one or multiple files and make sure that every detail of the architecture is implemented as code"
            elif "Code Reviewer: **[Start Chat]**".lower() in utterance:
                # Reviewer instruction is the comment section of the utterance.
                instruction = utterance.split("Comments on Codes:".lower())[-1].split("In the software,".lower())[0]
                instruction = instruction.replace("<comment>".lower(), "")
            elif "Software Test Engineer: **[Start Chat]**".lower() in utterance:
                if "Test Pass!".lower() in utterance:
                    instruction = "Test Pass!"
                else:
                    # Tester instruction is the error-summary section.
                    instruction = utterance.split("Error Summary of Test Reports:".lower())[-1].split("Note that each file must strictly follow a markdown code block format".lower())[0]
            else:
                assert False
            role = utterance.split(": **")[0]

            # Strip surrounding whitespace and a single pair of quotes.
            instruction = instruction.strip()
            if instruction.startswith("\""):
                instruction = instruction[1:]
            if instruction.endswith("\""):
                instruction = instruction[:-1]
            instruction = instruction.strip()
            instructions.append(instruction)

            role = role.strip()
            roles.append(role)

        # Positional pairing: i-th instruction/role goes to the i-th edge.
        for i in range(len(self.edges)):
            self.edges[i].instruction = instructions[i]
            self.edges[i].role = roles[i]
|
||||||
|
|
||||||
|
def find_shortest_path(self, uMID=None, vMID=None):
|
||||||
|
if uMID == None:
|
||||||
|
uMID = self.edges[0].sourceMID
|
||||||
|
if vMID == None:
|
||||||
|
vMID = self.edges[-1].targetMID
|
||||||
|
|
||||||
|
Q, visit, preMID, preEdge = Queue(), {}, {}, {}
|
||||||
|
Q.put(uMID)
|
||||||
|
visit[uMID] = True
|
||||||
|
while not Q.empty():
|
||||||
|
mID = Q.get()
|
||||||
|
if mID == vMID:
|
||||||
|
id, pathNodes, pathEdges = vMID, [], []
|
||||||
|
while id != uMID:
|
||||||
|
pathNodes.append(id)
|
||||||
|
pathEdges.append(preEdge[id])
|
||||||
|
id = preMID[id]
|
||||||
|
pathNodes.append(uMID)
|
||||||
|
pathNodes = pathNodes[::-1]
|
||||||
|
pathEdges = pathEdges[::-1]
|
||||||
|
return pathNodes, pathEdges
|
||||||
|
nextMIDs = [edge.targetMID for edge in self.edges if edge.sourceMID == mID]
|
||||||
|
nextEdges = [edge for edge in self.edges if edge.sourceMID == mID]
|
||||||
|
for i in range(len(nextMIDs)):
|
||||||
|
nextMID = nextMIDs[i]
|
||||||
|
nextEdge = nextEdges[i]
|
||||||
|
if nextMID not in visit.keys():
|
||||||
|
Q.put(nextMID)
|
||||||
|
visit[nextMID] = True
|
||||||
|
preMID[nextMID] = mID
|
||||||
|
preEdge[nextMID] = nextEdge
|
||||||
|
|
||||||
|
    def print(self):
        """Log a human-readable dump of all nodes and edges."""
        output = "\n"+"*" * 50 + " Graph " + "*" * 50 + "\n"
        output += "{} Nodes:\n".format(len(self.nodes.keys()))
        for key in self.nodes.keys():
            node = self.nodes[key]
            output += "{}, {}, {}\n".format(node.mID, node.version, node.commitMessage)
        output += "{} Edges:\n".format(len(self.edges))
        for edge in self.edges:
            # Instruction text is truncated to keep the dump readable.
            output += "{}: {} -> {} ({}: {})\n".format(edge.edgeId, edge.sourceMID, edge.targetMID, edge.role, edge.instruction[:60])
        output += "*" * 50 + " Graph " + "*" * 50
        log_and_print_online(output)
|
||||||
|
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
merged_node_dict = []
|
||||||
|
merged_edge_dict = []
|
||||||
|
for k,v in self.nodes.items():
|
||||||
|
merged_node_dict.append(v.__dict__)
|
||||||
|
for index,e in enumerate(self.edges):
|
||||||
|
merged_edge_dict.append(e.__dict__ )
|
||||||
|
return merged_node_dict,merged_edge_dict
|
430
ecl/memory.py
Normal file
430
ecl/memory.py
Normal file
@ -0,0 +1,430 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import openai
|
||||||
|
import faiss
|
||||||
|
import numpy as np
|
||||||
|
from datetime import datetime
|
||||||
|
sys.path.append(os.path.join(os.getcwd(),"ecl"))
|
||||||
|
#from utils import get_code_embedding,get_text_embedding
|
||||||
|
from utils import get_easyDict_from_filepath,log_and_print_online
|
||||||
|
from embedding import OpenAIEmbedding
|
||||||
|
|
||||||
|
class MemoryBase(ABC):
    """Abstract base for JSON-file-backed memory stores.

    Loads retrieval settings (top-k and similarity thresholds) from
    ``./ecl/config.yaml``, selects the embedding backend, and loads the
    memory content from ``directory`` (path to a ``.json`` file),
    creating an empty JSON file when none exists yet.
    """

    def __init__(self, directory: str) -> None:
        self.directory: str = directory

        cfg = get_easyDict_from_filepath("./ecl/config.yaml")
        self.top_k_code = cfg.retrieval.top_k_code
        self.top_k_text = cfg.retrieval.top_k_text
        self.code_thresh = cfg.retrieval.searchcode_thresh
        self.text_thresh = cfg.retrieval.searchtext_thresh

        self.embedding_method = None
        if cfg.embedding_method == "OpenAI":
            self.embedding_method = OpenAIEmbedding()

        # self.content holds the parsed MemoryCards JSON (a list of
        # memory pieces) or None when no usable memory exists.
        self.content = None
        if os.path.exists(self.directory) and self.directory.endswith('.json'):
            with open(self.directory) as file:
                self.content = json.load(file)
        elif not os.path.exists(self.directory):
            # Create an empty JSON file so later reads/writes succeed.
            # (the redundant file.close() inside the with-block was removed)
            with open(self.directory, 'w') as file:
                json.dump({}, file)
            print(f"Now the memory file '{self.directory}' is created")

        if self.content is None:
            print("Empty Memory")

    @abstractmethod
    def memory_retrieval(self) -> str:
        """Retrieve entries from memory; implemented by subclasses."""
        pass

    def _get_memory_count(self) -> int:
        """Number of stored memory pieces.

        Memory pieces are stored as a list; the last piece carries the
        running "total" counter. Returns 0 when content is not a list
        (e.g. the freshly-created ``{}`` file).
        """
        if isinstance(self.content, list):
            return self.content[-1].get("total")
        else:
            return 0
|
||||||
|
|
||||||
|
|
||||||
|
class AllMemory(MemoryBase):
    """Memory store over the whole MemoryCards.json content.

    Supports two retrieval directions over faiss indexes built from the
    stored embeddings (vectors are L2-normalized first, so squared-L2
    distance maps onto cosine similarity):

    * :meth:`search_text` — code query -> best matching instructionStar
    * :meth:`search_code` — text query -> best matching code
    """

    def __init__(self, directory: str):
        super().__init__(directory)

    # unused; init experience list
    def _init_explist(self):
        # FIX: the original appended to None (AttributeError) and read the
        # misspelled key "experineces"; stored pieces use "experiences".
        self.exp_list = None
        if self.content is None:
            self.exp_list = None
        else:
            self.exp_list = []
            for t in self.content:
                for experience in t.get("experiences"):
                    self.exp_list.append(experience)

    # clear all memory
    def _memory_clear(self) -> None:
        """Reset the memory file to an empty JSON object and drop content."""
        if os.path.exists(self.directory) and self.directory.endswith('.json'):
            # FIX: must open for writing; the original opened the file
            # read-only, so json.dump raised "not writable".
            with open(self.directory, 'w') as file:
                json.dump({}, file)
        self.content = None

    # get code sample
    def get_codesample(self) -> str:
        """Return the code of the newest node of the newest memory piece, or None."""
        if self._get_memory_count() >= 1:
            return self.content[-1].get("nodes")[-1]["code"]
        else:
            return None

    # get text str sample
    def get_textsample(self) -> str:
        """Return the instruction of the newest edge of the newest memory piece, or None."""
        if self._get_memory_count() >= 1:
            return self.content[-1].get("edges")[-1].get("instruction")
        else:
            return None

    # get code embedding from code mID
    def _get_codeembedding(self, mid):
        """Return the stored embedding of the node whose mID equals `mid` (None if absent)."""
        for t in self.content:
            for node in t["nodes"]:
                if node["mID"] == mid:
                    return node.get("embedding")

    # get instructionstar from sourcecode mID
    def _get_instructionstar(self, mid):
        """Return the instructionStar of the highest-valueGain experience
        originating from node `mid`, or None when nothing matches."""
        max_valueGain = -1
        instructionstar = None  # FIX: was unbound when no experience matched
        for t in self.content:
            for experience in t["experiences"]:
                if experience is None:
                    continue
                if experience["sourceMID"] == mid:
                    if experience.get("valueGain") >= max_valueGain:
                        # FIX: track the running maximum; the original never
                        # updated max_valueGain, so it returned the LAST
                        # matching experience instead of the best one.
                        max_valueGain = experience.get("valueGain")
                        instructionstar = experience.get("instructionStar")
        return instructionstar

    # get experience task and dir from sourcecode mID
    def _get_task_from_source(self, mid):
        """Return (task, dir) of the last memory piece containing an
        experience whose sourceMID is `mid`; (None, None) otherwise."""
        task = None
        task_dir = None
        for t in self.content:
            for experience in t["experiences"]:
                if experience is None:
                    continue
                if experience["sourceMID"] == mid:
                    task = t["task"]
                    task_dir = t["dir"]
        return task, task_dir

    # get experience task and dir from targetcode mID
    def _get_task_from_target(self, mid):
        """Return (task, dir) of the last memory piece containing an
        experience whose targetMID is `mid`; (None, None) otherwise."""
        task = None
        task_dir = None
        for t in self.content:
            for experience in t["experiences"]:
                if experience is None:
                    continue
                if experience["targetMID"] == mid:
                    task = t["task"]
                    task_dir = t["dir"]
        return task, task_dir

    # retrieval from MemoryCards
    def memory_retrieval(self, input_message: str, type: str, k=None):
        """Dispatch to search_code / search_text by `type`; `k` defaults to
        the configured top-k for the requested type."""
        if type == "code":
            return self.search_code(input_message, self.top_k_code if k is None else k)
        elif type == "text":
            return self.search_text(input_message, self.top_k_text if k is None else k)
        else:
            return None

    def search_text(self, code_query, k: int):
        """
        search instructionStar from a code query

        Keyword arguments:
        code_query -- code input
        k -- the number of instructions to search

        Return:
        (best k instructionStar, similarities, source mIDs, tasks, task dirs),
        or None when memory is empty, the query is None, or k == 0
        """
        if self._get_memory_count() == 0 or code_query is None or k == 0:
            return None

        code_query = self.embedding_method.get_code_embedding(code_query)
        if isinstance(code_query, list):
            code_query = np.array(code_query, dtype=np.float32)
            code_query = code_query.reshape(1, -1)

        sourcecodemid_list = []  # source-code mIDs that have experiences
        code_embeddings = []     # their stored code embeddings
        for t in self.content:
            for experience in t["experiences"]:
                sourcecodemid_list.append(experience.get("sourceMID"))
        sourcecodemid_list = list(set(sourcecodemid_list))  # remove duplicates
        for mid in sourcecodemid_list:
            code_embeddings.append(self._get_codeembedding(mid))
        code_embedding_data = np.array(code_embeddings, dtype=np.float32)

        faiss.normalize_L2(code_embedding_data)
        faiss.normalize_L2(code_query)
        # L2 distance on unit vectors stands in for cosine distance.
        index = faiss.IndexFlatL2(code_embedding_data.shape[1])
        index.add(code_embedding_data)

        # faiss returns SQUARED L2 distances by default; on normalized
        # vectors, similarity = 1 - d^2 / 2.
        distances, indices = index.search(code_query, k)
        similarities = 1 - (1 / 2) * distances

        task_list = []
        task_dir_list = []
        instructionStar_list = []
        sourceMIDS = []
        for i in range(k):
            hit = indices[0][i]
            similarity = similarities[0][i]
            if hit != -1 and similarity >= self.text_thresh:
                task, task_dir = self._get_task_from_source(sourcecodemid_list[hit])
                sourceMIDS.append(sourcecodemid_list[hit])
                task_list.append(task)
                task_dir_list.append(task_dir)
                instructionStar_list.append(self._get_instructionstar(sourcecodemid_list[hit]))

        filtered_similarities = np.array2string(similarities[:, :k])
        return instructionStar_list, filtered_similarities, sourceMIDS, task_list, task_dir_list

    def search_code(self, text_query, k: int):
        """search best code from a text query

        Keyword arguments:
        text_query -- text input
        k -- the number of code to search

        Return: (best k code, similarities, target mIDs, tasks, task dirs),
        or None when memory is empty, the query is None, or k == 0
        """
        if self._get_memory_count() == 0 or text_query is None or k == 0:
            return None

        text_query = self.embedding_method.get_text_embedding(text_query)
        if isinstance(text_query, list):
            text_query = np.array(text_query, dtype=np.float32)
            text_query = text_query.reshape(1, -1)

        text_embeddings = [exp.get("embedding") for t in self.content for exp in t["experiences"]]
        text_embedding_data = np.array(text_embeddings, dtype=np.float32)

        faiss.normalize_L2(text_embedding_data)
        faiss.normalize_L2(text_query)
        # L2 distance on unit vectors stands in for cosine distance.
        total_instructionStar = text_embedding_data.shape[0]
        index = faiss.IndexFlatL2(text_embedding_data.shape[1])
        index.add(text_embedding_data)
        # faiss returns SQUARED L2 distances; similarity = 1 - d^2 / 2.
        distances, indices = index.search(text_query, total_instructionStar)
        similarities = 1 - (1 / 2) * distances

        code_node_list = [node for t in self.content for node in t["nodes"]]
        experience_list = [experience for t in self.content for experience in t["experiences"]]

        targetMIDs = []
        target_code = []
        task_list = []
        task_dir_list = []
        filtered_similarities = []
        counter = 0
        added_set = set()  # de-duplicate target mIDs across experiences
        for i in range(total_instructionStar):
            hit = indices[0][i]
            similarity = similarities[0][i]
            if hit != -1 and counter < k:
                if similarity <= self.code_thresh:
                    # results are ordered best-first; the rest are worse
                    break
                mid = experience_list[hit].get("targetMID")
                if mid not in added_set:
                    targetMIDs.append(mid)
                    added_set.add(mid)
                    counter += 1
                    filtered_similarities.append(str(similarity))
            else:
                break

        for targetMID in targetMIDs:
            for code_node in code_node_list:
                if targetMID == code_node.get("mID"):
                    target_code.append(code_node.get("code"))
                    task, task_dir = self._get_task_from_target(targetMID)
                    task_list.append(task)
                    task_dir_list.append(task_dir)
        filtered_similarities = ",".join(filtered_similarities)
        return target_code, filtered_similarities, targetMIDs, task_list, task_dir_list
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Memory:
    """Top-level memory facade.

    Owns the per-key memory stores (currently only "All"), computes any
    missing embeddings for an experience, and (de)serializes memory
    pieces to MemoryCards.json.
    """

    def __init__(self):
        self.directory: str = None
        self.id_enabled: bool = False
        self.user_memory_filepath: str = None
        self.assistant_memory_filepath: str = None

        self.update_count = 0
        self.memory_keys: List[str] = ["All"]
        self.memory_data = {}

    def __str__(self) -> str:
        if self.memory_data.get("All") is None:
            return "No existed memory"
        else:
            return "Current memory length:{}".format(self.memory_data["All"]._get_memory_count())

    def _set_embedding(self, experience):
        """Fill in missing edge/node/experience embeddings via the configured
        embedding backend, logging per-item and per-phase timings."""
        graph = experience.graph
        edge_start_time = time.time()
        for edge in graph.edges:
            if edge.embedding is None:
                start_time = time.time()
                edge.embedding = self.memory_data["All"].embedding_method.get_text_embedding(edge.instruction)
                end_time = time.time()
                log_and_print_online("DONE: get edge embedding\ntime cost:{}\n".format(end_time - start_time))
        edge_duration = time.time() - edge_start_time
        log_and_print_online("DONE: got all EDGE embeddings\nEDGE embedding time cost:{}\n".format(edge_duration))

        node_start_time = time.time()
        for node_id in graph.nodes:
            node = graph.nodes[node_id]
            if node.embedding is None:
                start_time = time.time()
                node.embedding = self.memory_data["All"].embedding_method.get_code_embedding(node.code)
                end_time = time.time()
                log_and_print_online("DONE: get node embedding\ntime cost:{}\n".format(end_time - start_time))
        node_duration = (time.time() - node_start_time)
        log_and_print_online("DONE: got all NODE embeddings\nNODE embedding time cost:{}\n".format(node_duration))

        exp_start_time = time.time()
        for exp in experience.experiences:
            if exp.embedding is None:
                start_time = time.time()
                exp.embedding = self.memory_data["All"].embedding_method.get_text_embedding(exp.instructionStar)
                end_time = time.time()
                log_and_print_online("DONE: get exprience embedding\ntime cost:{}\n".format(end_time - start_time))
        exp_duration = (time.time() - exp_start_time)
        log_and_print_online("DONE: got all EXPERIENCE embeddings\nEXPERIENCE embedding time cost:{}\n".format(exp_duration))

        duration = edge_duration + node_duration + exp_duration
        log_and_print_online("All embedding DONE\ntime cost:{}\n".format(duration))

    # create memory path and upload memory from existed memory
    def upload(self):
        self.directory = os.path.join(os.getcwd(), "ecl", "memory")
        if not os.path.exists(self.directory):
            os.mkdir(self.directory)
        for key in self.memory_keys:
            if key == "All":
                path = os.path.join(self.directory, "MemoryCards.json")
                self.memory_data[key] = AllMemory(path)

    def _merge_previous_memory(self):
        """Flatten previously stored memory (a list of pieces, possibly with
        nested lists of pieces, or a single bare piece) into a flat list.

        Returns:
            (merged_pieces, next_index): the flattened pieces and the index
            to use for the next piece (taken from the last piece's "total").
        """
        # FIX: this logic was duplicated verbatim in upload_from_experience
        # and delete_memroy; it now lives in one place.
        merged_dic = []
        index = 0
        previous_memory = []
        content = self.memory_data["All"].content
        if content is not None and len(content) != 0:
            previous_memory = content
        if len(previous_memory) != 0 and isinstance(previous_memory, list):
            for t in previous_memory:
                if isinstance(t, list):
                    for subt in t:
                        if len(subt) != 0:
                            merged_dic.append(subt)
                elif len(t) != 0:
                    merged_dic.append(t)
            if merged_dic:  # guard: every piece could have been empty
                index = merged_dic[-1]["total"]
        elif len(previous_memory) != 0:
            merged_dic.append(previous_memory)
            index = 1
        return merged_dic, index

    # upload experience into memory
    def upload_from_experience(self, experience):
        """Append `experience` (graph + experiences) as a new memory piece
        and rewrite the MemoryCards file."""
        self._set_embedding(experience)
        with open(self.memory_data["All"].directory, 'w') as file:
            node_data, edge_data = experience.graph.to_dict()
            experience_data = experience.to_dict()

            merged_dic, index = self._merge_previous_memory()
            log_and_print_online("len(previous_memory)={}".format(len(merged_dic)))

            combined_json_str = {}
            combined_json_str["index"] = index
            combined_json_str["dir"] = experience.graph.directory
            combined_json_str["task"] = experience.graph.task
            combined_json_str["nodes"] = node_data
            combined_json_str["edges"] = edge_data
            combined_json_str["experiences"] = experience_data
            combined_json_str["total"] = combined_json_str["index"] + 1

            # FIX: the original if/else appended in BOTH branches — a single
            # unconditional append is equivalent. (An unused `dirList`
            # computation was also removed.)
            merged_dic.append(combined_json_str)

            json.dump(merged_dic, file)
            log_and_print_online("len(merged_dic)={}".format(len(merged_dic)) + "\n merged_dic dumped to {}".format(self.memory_data["All"].directory))
            log_and_print_online("[Conclusion]:\ntext_prompt_tokens:{}, text_total_tokens:{}\ncode_prompt_tokens:{}, code_total_tokens:{}\nprompt_tokens:{}, total_tokens:{}".format(self.memory_data["All"].embedding_method.text_prompt_tokens,
                                                                                                                                                                                     self.memory_data["All"].embedding_method.text_total_tokens,
                                                                                                                                                                                     self.memory_data["All"].embedding_method.code_prompt_tokens,
                                                                                                                                                                                     self.memory_data["All"].embedding_method.code_total_tokens,
                                                                                                                                                                                     self.memory_data["All"].embedding_method.prompt_tokens,
                                                                                                                                                                                     self.memory_data["All"].embedding_method.total_tokens))

    # delete memory from index
    def delete_memroy(self, idx: int):
        """Remove the memory piece at `idx` (no-op when out of range) and
        rewrite the MemoryCards file.

        NOTE(review): the misspelled name is kept for backward compatibility
        with existing callers.
        """
        with open(self.memory_data["All"].directory, 'w') as file:
            merged_dic, _ = self._merge_previous_memory()
            if idx < len(merged_dic):
                merged_dic.pop(idx)
            json.dump(merged_dic, file)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
1
ecl/memory/MemoryCards.json
Normal file
1
ecl/memory/MemoryCards.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
50
ecl/post_process/memory_filter.py
Normal file
50
ecl/post_process/memory_filter.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
filter_threshold = 0.9
|
||||||
|
|
||||||
|
def filter_valuegain(directory, filtered_directory, threshold=None):
    """filter memory by experience's valueGain, delete experience whose valueGain is smaller than the threshold

    Keyword arguments:
    directory -- the input directory of MemoryCards, like "./ecl/memory/MemoryCards.json"
    filtered_directory -- the output directory of filtered MemoryCards, like "./ecl/memory/MemoryCards.json"
    threshold -- keep only experiences with valueGain >= threshold; when None,
                 falls back to the module-level filter_threshold
    """
    if threshold is None:
        threshold = filter_threshold
    with open(directory) as file:
        content = json.load(file)
    new_content = []
    for memorypiece in content:
        experiences = memorypiece.get("experiences")
        if experiences is not None:
            print("origin:", len(experiences))
            filtered_experienceList = []
            for experience in experiences:
                valueGain = experience.get("valueGain")
                print(valueGain)
                if valueGain >= threshold:
                    filtered_experienceList.append(experience)
            # FIX: report the filtered count (the original re-printed the
            # unfiltered length, which never changes)
            print(len(filtered_experienceList))
            memorypiece["experiences"] = filtered_experienceList
            new_content.append(memorypiece)
        else:
            new_content.append(memorypiece)
    # FIX: dump the explicitly built new_content; the original dumped
    # `content`, which only worked by accidental aliasing.
    with open(filtered_directory, 'w') as file:
        json.dump(new_content, file)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry: filter a MemoryCards file by a valueGain threshold."""
    parser = argparse.ArgumentParser(description="Process some directories.")
    parser.add_argument("threshold", type=float, help="The filtered threshold for experiences")
    parser.add_argument("directory", type=str, help="The directory to process")
    parser.add_argument("filtered_directory", type=str, help="The directory for output")

    args = parser.parse_args()
    # FIX: the original bound a LOCAL name `filter_threshold`, so the
    # module-level threshold read by filter_valuegain was never updated
    # and the CLI threshold argument was silently ignored.
    global filter_threshold
    filter_threshold = args.threshold
    filter_valuegain(args.directory, args.filtered_directory)

if __name__ == "__main__":
    main()
|
176
ecl/utils.py
Normal file
176
ecl/utils.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
import subprocess
|
||||||
|
import json
|
||||||
|
import yaml
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
from easydict import EasyDict
|
||||||
|
import openai
|
||||||
|
from openai import OpenAI
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
import tiktoken
|
||||||
|
from typing import Any, Dict
|
||||||
|
from tenacity import (
|
||||||
|
retry,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_exponential
|
||||||
|
)
|
||||||
|
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
|
||||||
|
if 'BASE_URL' in os.environ:
|
||||||
|
BASE_URL = os.environ['BASE_URL']
|
||||||
|
else:
|
||||||
|
BASE_URL = None
|
||||||
|
|
||||||
|
def getFilesFromType(sourceDir, filetype):
    """Recursively collect paths under `sourceDir` whose file names end
    with `filetype` (e.g. ".py")."""
    matched = []
    for dirpath, _subdirs, names in os.walk(sourceDir):
        matched.extend(
            os.path.join(dirpath, name)
            for name in names
            if name.endswith(filetype)
        )
    return matched
|
||||||
|
|
||||||
|
def cmd(command: str):
    """Echo `command`, run it through the shell, and return its captured stdout."""
    print(">> {}".format(command))
    completed = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE)
    return completed.stdout
|
||||||
|
|
||||||
|
def get_easyDict_from_filepath(path: str):
    """Load a ``.json`` or ``.yaml`` config file into an EasyDict.

    Returns None for any other file extension.
    """
    if path.endswith('.json'):
        with open(path, 'r', encoding="utf-8") as file:
            return EasyDict(json.load(file, strict=False))
    if path.endswith('.yaml'):
        with open(path, 'r', encoding="utf-8") as file:
            raw = file.read()
        return EasyDict(yaml.load(raw, Loader=yaml.FullLoader))
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def calc_max_token(messages, model):
    """Return the completion-token budget left for `model` given `messages`.

    Counts prompt tokens with tiktoken, pads by a fixed 50-token
    send/receive gap, and subtracts from the model's context limit.
    """
    joined = "\n".join(message["content"] for message in messages)
    encoder = tiktoken.encoding_for_model(model)
    gap_between_send_receive = 50
    num_prompt_tokens = len(encoder.encode(joined)) + gap_between_send_receive

    num_max_token_map = {
        "gpt-3.5-turbo": 4096,
        "gpt-3.5-turbo-16k": 16384,
        "gpt-3.5-turbo-0613": 4096,
        "gpt-3.5-turbo-16k-0613": 16384,
        "gpt-4": 8192,
        "gpt-4-0613": 8192,
        "gpt-4-32k": 32768,
    }
    return num_max_token_map[model] - num_prompt_tokens
|
||||||
|
|
||||||
|
|
||||||
|
class ModelBackend(ABC):
    r"""Base class for different model backends.
    May be OpenAI API, a local LLM, a stub for unit tests, etc."""

    @abstractmethod
    def run(self, *args, **kwargs) -> Dict[str, Any]:
        r"""Run the query against the backend model.

        Raises:
            RuntimeError: if the return value from OpenAI API
                is not a dict that is expected.

        Returns:
            Dict[str, Any]: All backends must return a dict in OpenAI format.
        """
        pass
|
||||||
|
|
||||||
|
class OpenAIModel(ModelBackend):
    r"""OpenAI API in a unified ModelBackend interface."""

    def __init__(self, model_type, model_config_dict: Dict = None) -> None:
        super().__init__()
        self.model_type = model_type
        self.model_config_dict = model_config_dict
        if self.model_config_dict is None:
            # default sampling configuration when the caller supplies none
            self.model_config_dict = {"temperature": 0.2,
                                      "top_p": 1.0,
                                      "n": 1,
                                      "stream": False,
                                      "frequency_penalty": 0.0,
                                      "presence_penalty": 0.0,
                                      "logit_bias": {},
                                      }
        # running token-usage counters, accumulated across run() calls
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.total_tokens = 0

    @retry(wait=wait_exponential(min=5, max=60), stop=stop_after_attempt(5))
    def run(self, messages):
        """Send `messages` to the chat-completions endpoint and return the
        response as a plain dict, updating the usage counters.

        Raises:
            RuntimeError: when the API response is not a dict.
        """
        if BASE_URL:
            client = openai.OpenAI(
                api_key=OPENAI_API_KEY,
                base_url=BASE_URL,
            )
        else:
            client = openai.OpenAI(
                api_key=OPENAI_API_KEY
            )

        string = "\n".join([message["content"] for message in messages])
        encoding = tiktoken.encoding_for_model(self.model_type)
        num_prompt_tokens = len(encoding.encode(string))
        gap_between_send_receive = 15 * len(messages)
        num_prompt_tokens += gap_between_send_receive

        num_max_token_map = {
            "gpt-3.5-turbo": 4096,
            "gpt-3.5-turbo-16k": 16384,
            "gpt-3.5-turbo-0613": 4096,
            "gpt-3.5-turbo-16k-0613": 16384,
            "gpt-4": 8192,
            "gpt-4-0613": 8192,
            "gpt-4-32k": 32768,
        }
        # FIX: compute the completion budget BEFORE the request (the original
        # computed it after the call, where it had no effect) and pass the
        # configured model/params instead of hard-coded literals.
        num_max_token = num_max_token_map[self.model_type]
        num_max_completion_tokens = num_max_token - num_prompt_tokens
        self.model_config_dict['max_tokens'] = num_max_completion_tokens

        response = client.chat.completions.create(messages=messages,
                                                  model=self.model_type,
                                                  **self.model_config_dict,
                                                  ).model_dump()

        log_and_print_online(
            "InstructionStar generation:\n**[OpenAI_Usage_Info Receive]**\nprompt_tokens: {}\ncompletion_tokens: {}\ntotal_tokens: {}\n".format(
                response["usage"]["prompt_tokens"], response["usage"]["completion_tokens"],
                response["usage"]["total_tokens"]))
        self.prompt_tokens += response["usage"]["prompt_tokens"]
        self.completion_tokens += response["usage"]["completion_tokens"]
        self.total_tokens += response["usage"]["total_tokens"]

        if not isinstance(response, Dict):
            raise RuntimeError("Unexpected return from OpenAI API")
        return response
|
||||||
|
|
||||||
|
|
||||||
|
def now():
    """Current local time formatted as a compact YYYYmmddHHMMSS string."""
    local_time = time.localtime()
    return time.strftime("%Y%m%d%H%M%S", local_time)
|
||||||
|
|
||||||
|
def log_and_print_online(content=None):
    """Print `content` to stdout and mirror it to the logging module.

    A None content is a silent no-op.
    """
    if content is None:
        return
    print(content)
    logging.info(content)
|
66
wiki.md
66
wiki.md
@ -131,6 +131,62 @@ then start building a software by ``python3 run.py`` and go to [Visualizer Websi
|
|||||||
### Official Docker Image
|
### Official Docker Image
|
||||||
- in preparation
|
- in preparation
|
||||||
|
|
||||||
|
## Experiential Co-Learning Guide
|
||||||
|
### Co-Tracking
|
||||||
|
|
||||||
|
- **Start Co-Tracking**: Use the following command to initiate the building of software, replacing `[description_of_your_idea]` with the task description and `[project_name]` with the project name. This is the same as starting ChatDev.
|
||||||
|
```bash
|
||||||
|
python3 run.py --task "[description_of_your_idea]" --name "[project_name]"
|
||||||
|
```
|
||||||
|
The software generated in co-tracking phase is ready for the agents' experience pool in the following steps.
|
||||||
|
### Co-Memorizing
|
||||||
|
- **Initiating Co-Memorizing**: To begin the memorization process for the generated software in a specified directory, run the `ecl.py` script using the following command:
|
||||||
|
```bash
|
||||||
|
python3 ecl/ecl.py "<path>" "[options]"
|
||||||
|
```
|
||||||
|
`<path>`: The path to the file or directory to process.
|
||||||
|
`[options]`: This can be set as `-d`. This flag indicates that the script should process all files in the given directory. If this flag is not set, the script will process the file specified in path.
|
||||||
|
After this process, the experiences have been extracted from the production of software and added to the agents' experience pool in `ecl/memory/MemoryCards.json`.
|
||||||
|
\
|
||||||
|
**For example:**
|
||||||
|
If you want to memorize only one software, you can use:
|
||||||
|
```bash
|
||||||
|
python3 ecl/ecl.py "<Software Path to file>"
|
||||||
|
```
|
||||||
|
And the software path should be like `"WareHouse/project_name_DefaultOrganization_timestamp"`.
|
||||||
|
\
|
||||||
|
If you want to memorize all files in a directory, you can use:
|
||||||
|
```bash
|
||||||
|
python3 ecl/ecl.py "<Software Path to Directory>" -d
|
||||||
|
```
|
||||||
|
the software path should be like `"WareHouse"`.
|
||||||
|
- **Memory Filter**: To get a higher quality experience pool, it is suggested to use `ecl/post_process/memory_filter.py` to filter the `MemoryCards.json`. When running the `memory_filter.py` script, you need to specify three arguments: the filter threshold, the input directory, and the output directory.
|
||||||
|
```bash
|
||||||
|
python3 ecl/post_process/memory_filter.py "<threshold>" "<directory>" "<filtered_directory>"
|
||||||
|
```
|
||||||
|
- `<threshold>`: Require a value within the range of 0 to 1 (exclusive). It is used as the threshold to filter experiences by their 'valuegain'. Only experiences with a 'valuegain' that is equal to or greater than this threshold will be considered.
|
||||||
|
- `<directory>`: The file path to the memory directory that you intend to process.
|
||||||
|
- `<filtered_directory>`: The file path to a directory where you want to store the processed data.
|
||||||
|
|
||||||
|
\
|
||||||
|
**For example:**
|
||||||
|
```bash
|
||||||
|
python3 ecl/post_process/memory_filter.py 0.9 "ecl/memory/MemoryCards.json" "ecl/memory/MemoryCards_filtered.json"
|
||||||
|
```
|
||||||
|
> **Notice:** By default, the `MemoryCards.json` is set to be empty. You can customize your own experience pool for agents following steps above. And we have also provided our `MemoryCards.json` used in our experiment in [MemoryCards.json](https://drive.google.com/drive/folders/1czsR4swQyqpoN8zwN0-rSFcTVl68zTDY?usp=sharing). You can download the json file through the link and put it under `ecl/memory` folder. This allows you to directly proceed to the Co-Reasoning phase without needing to redo the Co-Tracking and Co-Memorizing steps.
|
||||||
|
### Co-Reasoning
|
||||||
|
- **Memory Usage Configuration**:
|
||||||
|
In the `CompanyConfig/Default/ChatChainConfig.json` file, the `with_memory` option should be set **True**. \
|
||||||
|
In the `ecl/config.yaml` file, you can adjust the settings for **top k** and **similarity threshold** for both code and text retrieval.
|
||||||
|
By default, `with_memory` is set as False and the system is configured to retrieve the top 1 result with a similarity threshold of zero for both code and text.
|
||||||
|
- **Start Co-Reasoning**: Once you have completed memory usage configuration, similar to the Co-Tracking phase, you can use the command below to start the software building process. Replace `[description_of_your_idea]` with the task description from the test set and `[project_name]` with the project name from the test set:
|
||||||
|
```
|
||||||
|
python3 run.py --task "[description_of_your_idea]" --name "[project_name]"
|
||||||
|
```
|
||||||
|
In this process of software development, the agents will engage their experience pool(`MemoryCards.json`) into software development!
|
||||||
|
|
||||||
|
Detailed descriptions and experiment results about this **Experiential Co-Learning** Module lies in our preprint paper at https://arxiv.org/abs/2312.17025.
|
||||||
|
|
||||||
## Customization
|
## Customization
|
||||||
|
|
||||||
- You can customize your company in three kinds of granularity:
|
- You can customize your company in three kinds of granularity:
|
||||||
@ -278,6 +334,7 @@ then start building a software by ``python3 run.py`` and go to [Visualizer Websi
|
|||||||
- *self_improve*: flag for self-improvement on user input prompt. It is a special chat that LLM plays as a prompt engineer to improve the user input prompt. **⚠️ Attention** Model generated prompts contain uncertainty and there may
|
- *self_improve*: flag for self-improvement on user input prompt. It is a special chat that LLM plays as a prompt engineer to improve the user input prompt. **⚠️ Attention** Model generated prompts contain uncertainty and there may
|
||||||
be a deviation from the requirement meaning contained in the original prompt.
|
be a deviation from the requirement meaning contained in the original prompt.
|
||||||
- *background_prompt*: background prompt that will be added to every inquiry to LLM
|
- *background_prompt*: background prompt that will be added to every inquiry to LLM
|
||||||
|
- *with_memory*: Whether to utilize the experience pool for agents. The experience pool lies in `ecl/memory/MemoryCards.json`.
|
||||||
- params in SimplePhase:
|
- params in SimplePhase:
|
||||||
- *max_turn_step*: Max number of chatting turn. You can increase max_turn_step for better performance but it will
|
- *max_turn_step*: Max number of chatting turn. You can increase max_turn_step for better performance but it will
|
||||||
take a longer time to finish the phase.
|
take a longer time to finish the phase.
|
||||||
@ -290,10 +347,11 @@ then start building a software by ``python3 run.py`` and go to [Visualizer Websi
|
|||||||
|
|
||||||
```commandline
|
```commandline
|
||||||
├── CompanyConfig # Configuration Files for ChatDev, including ChatChain, Phase and Role config json.
|
├── CompanyConfig # Configuration Files for ChatDev, including ChatChain, Phase and Role config json.
|
||||||
├── WareHouse # Folder for generated software
|
├── WareHouse # Folder for Generated Software
|
||||||
├── camel # Camel RolePlay component
|
├── camel # Camel RolePlay Component
|
||||||
├── chatdev # ChatDev core code
|
├── chatdev # ChatDev Core Code
|
||||||
├── misc # assets of example and demo
|
├── ecl # Experiential Co-Learning Module
|
||||||
|
├── misc # Assets of Example and Demo
|
||||||
├── visualizer # Visualizer Folder
|
├── visualizer # Visualizer Folder
|
||||||
├── run.py # Entry of ChatDev
|
├── run.py # Entry of ChatDev
|
||||||
├── requirements.txt
|
├── requirements.txt
|
||||||
|
Loading…
Reference in New Issue
Block a user