Merge pull request #337 from NA-Wen/main

Integrate Experiential Co-Learning Module
This commit is contained in:
Chen Qian 2024-01-25 14:04:32 +08:00 committed by GitHub
commit dc6dd4284d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
33 changed files with 20654 additions and 18902 deletions

View File

@ -1,6 +1,5 @@
{
"chain": [
{
"chain": [{
"phase": "DemandAnalysis",
"phaseType": "SimplePhase",
"max_turn_step": -1,
@ -22,21 +21,18 @@
"phase": "CodeCompleteAll",
"phaseType": "ComposedPhase",
"cycleNum": 10,
"Composition": [
{
"Composition": [{
"phase": "CodeComplete",
"phaseType": "SimplePhase",
"max_turn_step": 1,
"need_reflect": "False"
}
]
}]
},
{
"phase": "CodeReview",
"phaseType": "ComposedPhase",
"cycleNum": 3,
"Composition": [
{
"Composition": [{
"phase": "CodeReviewComment",
"phaseType": "SimplePhase",
"max_turn_step": 1,
@ -54,8 +50,7 @@
"phase": "Test",
"phaseType": "ComposedPhase",
"cycleNum": 3,
"Composition": [
{
"Composition": [{
"phase": "TestErrorSummary",
"phaseType": "SimplePhase",
"max_turn_step": 1,
@ -99,5 +94,6 @@
"web_spider": "False",
"self_improve": "False",
"incremental_develop": "False",
"with_memory": "False",
"background_prompt": "ChatDev is a software company powered by multiple intelligent agents, such as chief executive officer, chief human resources officer, chief product officer, chief technology officer, etc, with a multi-agent organizational structure and the mission of 'changing the digital world through programming'."
}

View File

@ -27,7 +27,8 @@
</p>
## 🎉 News
* **December 28, 2023: We present Experiential Co-Learning, an innovative approach where instructor and assistant agents accumulate shortcut-oriented experiences to effectively solve new tasks, reducing repetitive errors and enhancing efficiency. Check out our preprint paper at https://arxiv.org/abs/2312.17025 and this technique will soon be integrated into ChatDev.**
* **January 25, 2024: We integrate Experiential Co-Learning Module into ChatDev. Please see the [Experiential Co-Learning Guide](wiki.md#co-tracking).**
* December 28, 2023: We present Experiential Co-Learning, an innovative approach where instructor and assistant agents accumulate shortcut-oriented experiences to effectively solve new tasks, reducing repetitive errors and enhancing efficiency. Check out our preprint paper at https://arxiv.org/abs/2312.17025 and this technique will soon be integrated into ChatDev.
<p align="center">
<img src='./misc/ecl.png' width=860>
</p>

@ -1 +0,0 @@
Subproject commit e0396448114be2e320564cdfbe6bcf4082dd4e42

View File

@ -28,7 +28,7 @@ from camel.utils import (
num_tokens_from_messages,
openai_api_key_required,
)
from chatdev.utils import log_visualize
try:
from openai.types.chat import ChatCompletion
@ -74,6 +74,7 @@ class ChatAgent(BaseAgent):
Args:
system_message (SystemMessage): The system message for the chat agent.
memory (Any, optional): Experience memory object the chat agent retrieves from; no memory is used when :obj:`None`. (default: :obj:`None`)
model (ModelType, optional): The LLM model to use for generating
responses. (default :obj:`ModelType.GPT_3_5_TURBO`)
model_config (Any, optional): Configuration options for the LLM model.
@ -86,6 +87,7 @@ class ChatAgent(BaseAgent):
def __init__(
self,
system_message: SystemMessage,
memory = None,
model: Optional[ModelType] = None,
model_config: Optional[Any] = None,
message_window_size: Optional[int] = None,
@ -102,6 +104,10 @@ class ChatAgent(BaseAgent):
self.terminated: bool = False
self.info: bool = False
self.init_messages()
if memory !=None and self.role_name in["Code Reviewer","Programmer","Software Test Engineer"]:
self.memory = memory.memory_data.get("All")
else:
self.memory = None
def reset(self) -> List[MessageType]:
r"""Resets the :obj:`ChatAgent` to its initial state and returns the
@ -159,6 +165,41 @@ class ChatAgent(BaseAgent):
"""
self.stored_messages.append(message)
return self.stored_messages
def use_memory(self, input_message) -> Optional[str]:
    """Retrieve past experience related to ``input_message`` from memory.

    Agents whose role is "Programmer" query the memory for code; every
    other role queries it for text (instructions).

    Args:
        input_message: The query handed to ``self.memory.memory_retrieval``.

    Returns:
        The retrieved entries joined into one string ("" separator for
        code, ";" for text), or ``None`` when the agent has no memory or
        nothing was retrieved.
    """
    if self.memory is None:
        return None

    if self.role_name == "Programmer":
        kind, separator = "code", ""
    else:
        kind, separator = "text", ";"

    result = self.memory.memory_retrieval(input_message, kind)
    # The retrieval result is a tuple whose first element holds the entries;
    # the remaining elements (distances, ids, task info) are not needed here.
    retrieved = result[0] if result is not None else None

    # An empty or missing result is normalised to None so that callers
    # testing "is there any memory?" never receive an empty list.
    if not retrieved:
        log_visualize(self.role_name,
                      "thinking back but find nothing useful")
        return None

    target_memory = separator.join(retrieved)
    log_visualize(self.role_name,
                  "thinking back and found some related {}: \n--------------------------\n".format(kind)
                  + target_memory)
    return target_memory
@retry(wait=wait_exponential(min=5, max=60), stop=stop_after_attempt(5))
@openai_api_key_required

View File

@ -90,13 +90,16 @@ class RolePlaying:
sys_msg_generator_kwargs: Optional[Dict] = None,
extend_sys_msg_meta_dicts: Optional[List[Dict]] = None,
extend_task_specify_meta_dict: Optional[Dict] = None,
background_prompt: Optional[str] = ""
background_prompt: Optional[str] = "",
memory = None,
) -> None:
self.with_task_specify = with_task_specify
self.with_task_planner = with_task_planner
self.with_critic_in_the_loop = with_critic_in_the_loop
self.model_type = model_type
self.task_type = task_type
self.memory = memory
if with_task_specify:
task_specify_meta_dict = dict()
@ -148,9 +151,9 @@ class RolePlaying:
meta_dict=sys_msg_meta_dicts[1],
content=user_role_prompt.format(**sys_msg_meta_dicts[1]))
self.assistant_agent: ChatAgent = ChatAgent(self.assistant_sys_msg, model_type,
self.assistant_agent: ChatAgent = ChatAgent(self.assistant_sys_msg, memory, model_type,
**(assistant_agent_kwargs or {}), )
self.user_agent: ChatAgent = ChatAgent(self.user_sys_msg, model_type, **(user_agent_kwargs or {}), )
self.user_agent: ChatAgent = ChatAgent(self.user_sys_msg,memory, model_type, **(user_agent_kwargs or {}), )
if with_critic_in_the_loop:
raise ValueError("with_critic_in_the_loop not available")
@ -187,6 +190,9 @@ class RolePlaying:
content = phase_prompt.format(
**({"assistant_role": self.assistant_agent.role_name} | placeholders)
)
retrieval_memory = self.assistant_agent.use_memory(content)
if retrieval_memory!= None:
placeholders["examples"] = retrieval_memory
user_msg = UserChatMessage(
role_name=self.user_sys_msg.role_name,
role="user",

View File

@ -70,7 +70,9 @@ class ChatChain:
gui_design=check_bool(self.config["gui_design"]),
git_management=check_bool(self.config["git_management"]),
incremental_develop=check_bool(self.config["incremental_develop"]),
background_prompt=self.config["background_prompt"])
background_prompt=self.config["background_prompt"],
with_memory=check_bool(self.config["with_memory"]))
self.chat_env = ChatEnv(self.chat_env_config)
# the user input prompt will be self-improved (if set "self_improve": "True" in ChatChainConfig.json)
@ -204,6 +206,9 @@ class ChatChain:
software_path = os.path.join(directory, "_".join([self.project_name, self.org_name, self.start_time]))
self.chat_env.set_directory(software_path)
if self.chat_env.config.with_memory is True:
self.chat_env.init_memory()
# copy config files to software path
shutil.copy(self.config_path, software_path)
shutil.copy(self.config_phase_path, software_path)

View File

@ -13,6 +13,7 @@ from chatdev.codes import Codes
from chatdev.documents import Documents
from chatdev.roster import Roster
from chatdev.utils import log_visualize
from ecl.memory import Memory
try:
from openai.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall
@ -28,15 +29,18 @@ class ChatEnvConfig:
gui_design,
git_management,
incremental_develop,
background_prompt):
background_prompt,
with_memory):
self.clear_structure = clear_structure # Whether to clear non-software files in the WareHouse and cache files in generated software path
self.gui_design = gui_design # Encourage ChatDev generate software with GUI
self.git_management = git_management # Whether to use git to manage the creation and changes of generated software
self.incremental_develop = incremental_develop # Whether to use incremental develop on an existing project
self.background_prompt = background_prompt # background prompt that will be added to every inquiry to LLM
self.with_memory = with_memory # Whether to use memory in the interaction between agents
def __str__(self):
string = ""
string += "ChatEnvConfig.with_memory: {}\n".format(self.with_memory)
string += "ChatEnvConfig.clear_structure: {}\n".format(self.clear_structure)
string += "ChatEnvConfig.git_management: {}\n".format(self.git_management)
string += "ChatEnvConfig.gui_design: {}\n".format(self.gui_design)
@ -50,6 +54,7 @@ class ChatEnv:
self.config = chat_env_config
self.roster: Roster = Roster()
self.codes: Codes = Codes()
self.memory: Memory = Memory()
self.proposed_images: Dict[str, str] = {}
self.incorporated_images: Dict[str, str] = {}
self.requirements: Documents = Documents()
@ -92,6 +97,13 @@ class ChatEnv:
else:
os.mkdir(self.env_dict['directory'])
def init_memory(self):
    """Enable the memory, point it at ``<cwd>/ecl/memory`` and load it.

    The directory is created when it does not exist yet, after which
    ``self.memory.upload()`` is called.
    """
    self.memory.id_enabled = True
    self.memory.directory = os.path.join(os.getcwd(), "ecl", "memory")
    # makedirs instead of mkdir: also creates a missing parent "ecl" folder
    # and does not fail if the directory already exists.
    os.makedirs(self.memory.directory, exist_ok=True)
    self.memory.upload()
def exist_bugs(self) -> tuple[bool, str]:
directory = self.env_dict['directory']

View File

@ -59,6 +59,7 @@ class Phase(ABC):
need_reflect=False,
with_task_specify=False,
model_type=ModelType.GPT_3_5_TURBO,
memory=None,
placeholders=None,
chat_turn_limit=10
) -> str:
@ -102,6 +103,7 @@ class Phase(ABC):
task_prompt=task_prompt,
task_type=task_type,
with_task_specify=with_task_specify,
memory=memory,
model_type=model_type,
background_prompt=chat_env.config.background_prompt
)
@ -227,6 +229,7 @@ class Phase(ABC):
user_role_prompt=self.counselor_prompt,
placeholders={"conversations": messages, "question": question},
need_reflect=False,
memory=chat_env.memory,
chat_turn_limit=1,
model_type=self.model_type)
@ -300,6 +303,7 @@ class Phase(ABC):
user_role_prompt=self.user_role_prompt,
chat_turn_limit=chat_turn_limit,
placeholders=self.phase_env,
memory=chat_env.memory,
model_type=self.model_type)
chat_env = self.update_chat_env(chat_env)
return chat_env
@ -529,6 +533,7 @@ class CodeReviewHuman(Phase):
user_role_prompt=self.user_role_prompt,
chat_turn_limit=chat_turn_limit,
placeholders=self.phase_env,
memory=chat_env.memory,
model_type=self.model_type)
chat_env = self.update_chat_env(chat_env)
return chat_env
@ -579,6 +584,7 @@ class TestErrorSummary(Phase):
phase_name=self.phase_name,
assistant_role_prompt=self.assistant_role_prompt,
user_role_prompt=self.user_role_prompt,
memory=chat_env.memory,
chat_turn_limit=chat_turn_limit,
placeholders=self.phase_env)
chat_env = self.update_chat_env(chat_env)

163
ecl/codes.py Normal file
View File

@ -0,0 +1,163 @@
import difflib
import os
import re
import subprocess
import shutil
import time
import signal
from utils import get_easyDict_from_filepath
class Codes:
    """Collection of source files parsed out of markdown-style generated text.

    ``codebooks`` maps a file name to its code with blank lines removed.
    The working directory and the entry script name are read from
    ``./ecl/config.yaml`` (``codes.tmp_directory`` / ``codes.main_script``).
    """

    def __init__(self, generated_content=""):
        """Parse ``generated_content`` into ``self.codebooks``.

        Every fenced code block is paired with a file name taken from the
        text preceding it; blocks containing ``__main__`` are stored as
        ``main.py`` and blocks containing the placeholder ``CODE`` are
        skipped.
        """
        cfg = get_easyDict_from_filepath("./ecl/config.yaml")
        self.directory: str = cfg.codes.tmp_directory
        self.main_script: str = cfg.codes.main_script
        self.generated_content: str = generated_content
        self.codebooks = {}

        def extract_filename_from_line(lines):
            # The last "name.ext" looking token before the fence wins.
            file_name = ""
            for candidate in re.finditer(r"(\w+\.\w+)", lines, re.DOTALL):
                file_name = candidate.group()
                file_name = file_name.lower()
            return file_name

        def extract_filename_from_code(code):
            # Fall back to the name of the last class defined in the block.
            file_name = ""
            regex_extract = r"class (\S+?):\n"
            for match_extract in re.finditer(regex_extract, code, re.DOTALL):
                file_name = match_extract.group(1)
            file_name = file_name.lower().split("(")[0] + ".py"
            return file_name

        if generated_content != "":
            regex = r"(.+?)\n```.*?\n(.*?)```"
            for match in re.finditer(regex, self.generated_content, re.DOTALL):
                code = match.group(2)
                if "CODE" in code:
                    continue
                filename = extract_filename_from_line(match.group(1))
                if "__main__" in code:
                    filename = "main.py"
                if filename == "":  # post-processing
                    filename = extract_filename_from_code(code)
                assert filename != ""
                if filename is not None and code is not None and len(filename) > 0 and len(code) > 0:
                    self.codebooks[filename] = self._format_code(code)

    def _format_code(self, code):
        """Return ``code`` with empty and whitespace-only lines removed."""
        return "\n".join(line for line in code.split("\n") if len(line.strip()) > 0)

    def _update_codes(self, generated_content):
        """Merge the files parsed from ``generated_content`` into ``codebooks``.

        New files are added and changed files overwritten; files absent
        from ``generated_content`` are kept untouched.
        """
        new_codes = Codes(generated_content)
        self.codebooks.update(new_codes.codebooks)

    def _rewrite_codes(self) -> None:
        """Recreate ``self.directory`` from scratch and write every file into it."""
        directory = self.directory
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory, exist_ok=True)
        for filename, content in self.codebooks.items():
            filepath = os.path.join(directory, filename)
            with open(filepath, "w", encoding="utf-8") as writer:
                writer.write(content)

    def _run_codes(self) -> tuple:
        """Run the main script for about three seconds and report failures.

        Returns:
            A ``(has_bug, message)`` tuple. ``has_bug`` is ``True`` when the
            process wrote a traceback to stderr or launching it raised;
            ``message`` is the error text or a success note.
        """
        directory = os.path.abspath(self.directory)
        if self.main_script not in os.listdir(directory):
            return False, "{} Not Found".format(self.main_script)

        success_info = "The software run successfully without errors."

        try:
            # check if we are on windows or linux
            if os.name == 'nt':
                command = "cd {} && dir && python {}".format(directory, self.main_script)
                process = subprocess.Popen(
                    command,
                    shell=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
                )
            else:
                command = "cd {}; ls -l; python3 {};".format(directory, self.main_script)
                process = subprocess.Popen(command,
                                           shell=True,
                                           preexec_fn=os.setsid,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE
                                           )
            time.sleep(3)

            # poll() is what refreshes the exit status; reading
            # process.returncode before polling always yields None.
            return_code = process.poll()

            # Check if the software is still running
            if return_code is None:
                if "killpg" in dir(os):
                    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
                else:
                    os.kill(process.pid, signal.SIGTERM)
                    if process.poll() is None:
                        os.kill(process.pid, signal.CTRL_BREAK_EVENT)

            if return_code == 0:
                return False, success_info

            error_output = process.stderr.read().decode('utf-8')
            if error_output:
                if "Traceback".lower() in error_output.lower():
                    errs = error_output.replace(directory + "/", "")
                    return True, errs
            else:
                return False, success_info
        except subprocess.CalledProcessError as e:
            return True, f"Error: {e}"
        except Exception as ex:
            return True, f"An error occurred: {ex}"

        return False, success_info

    def _get_codes(self) -> str:
        """Render all files as ``name`` + fenced code block, in insertion order."""
        content = ""
        for filename, code in self.codebooks.items():
            language = "python" if filename.endswith(".py") else filename.split(".")[-1]
            content += "{}\n```{}\n{}\n```\n\n".format(filename, language, code)
        return content

    def _load_from_hardware(self, directory) -> None:
        """Load every ``.py`` file below ``directory`` into ``codebooks``.

        Files are keyed by their base name, so equally named files in
        different sub-directories overwrite each other.

        Raises:
            AssertionError: if ``directory`` itself holds no ``.py`` file.
        """
        assert len([filename for filename in os.listdir(directory) if filename.endswith(".py")]) > 0
        for root, directories, filenames in os.walk(directory):
            for filename in filenames:
                if filename.endswith(".py"):
                    # Join with the walk root, not the top directory, so
                    # files in sub-directories can actually be opened.
                    with open(os.path.join(root, filename), "r", encoding="utf-8") as reader:
                        self.codebooks[filename] = self._format_code(reader.read())
        print("{} files read from {}".format(len(self.codebooks.keys()), directory))

17
ecl/config.yaml Normal file
View File

@ -0,0 +1,17 @@
experience:
reap_zombie: True
threshold: 0
upper_limit: 10
codes:
tmp_directory: "tmp_codes"
main_script: "main.py"
embedding_method: "OpenAI"
retrieval:
top_k_code: 1 # top k target code
top_k_text: 1 # top k instructionstar
searchcode_thresh: 0 # similarity threshold between text query and instructionstar, search for targetcode
searchtext_thresh: 0 # similarity threshold between code query and sourcecode, search for instructionstar

69
ecl/ecl.py Normal file
View File

@ -0,0 +1,69 @@
import argparse
from graph import Graph
from experience import Experience
from utils import get_easyDict_from_filepath,now ,log_and_print_online
from memory import Memory
import sys
import os
import logging
sys.path.append(os.path.join(os.getcwd(),"ecl"))
def memorize(directory):
    """Build experiences from one software directory and upload them to memory.

    A log file ``ecl/logs/ecl_<directory name>.log`` replaces all handlers
    of the root logger for the duration of the run. A graph is created from
    ``directory``; when it has both nodes and edges the experiences are
    estimated, extracted and uploaded through ``Memory``.

    Args:
        directory: Path of the software directory to memorize.
    """
    print(directory)
    cfg = get_easyDict_from_filepath("./ecl/config.yaml")

    folder_path = "ecl/logs"
    os.makedirs(folder_path, exist_ok=True)
    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" and every such run log to "ecl_.log".
    log_filename = folder_path + "/ecl_{}.log".format(os.path.basename(os.path.normpath(directory)))
    print(log_filename)

    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)
        # Close what we detach; otherwise each processed directory leaks
        # the previous run's open log file.
        handler.close()
    file_handler = logging.FileHandler(log_filename, mode='w', encoding='utf-8')
    formatter = logging.Formatter('[%(asctime)s %(levelname)s] %(message)s', datefmt='%Y-%d-%m %H:%M:%S')
    file_handler.setFormatter(formatter)
    root_logger.addHandler(file_handler)
    root_logger.setLevel(logging.INFO)

    log_and_print_online("[Config]:" + str(cfg))
    graph = Graph()
    graph.create_from_log(directory)
    graph.print()

    experience = Experience(graph, directory)
    if len(graph.nodes) == 0 or len(graph.edges) == 0:
        log_and_print_online("No node or no edges constrcuted from the task execution process, maybe due to a unfinished software production or sometimes single node appears")
    else:
        if cfg.experience.reap_zombie:
            experience.reap_zombie()
            graph.print()
        experience.estimate()
        # The extracted experiences are also stored on the Experience
        # object, which is what the upload below reads.
        experience.extract_thresholded_experiences()

        # memory upload
        memory = Memory()
        memory.upload()
        memory.upload_from_experience(experience)
def process_directory(directory):
    """Memorize every sub-directory met while walking ``directory``.

    The walk is recursive, so nested sub-directories are memorized too.
    """
    for parent, subdir_names, _ in os.walk(directory):
        for subdir_name in subdir_names:
            memorize(os.path.join(parent, subdir_name))
def main():
    """Command-line entry point: memorize one path, or with ``-d`` a directory of them."""
    parser = argparse.ArgumentParser(description="Memorize one software or softwares from the directory.")
    parser.add_argument("path", help="The file or directory to process")
    parser.add_argument("-d", "--directory", action="store_true", help="Process all files in the given directory.")
    args = parser.parse_args()

    # Pick the handler first, then apply it to the given path.
    handler = process_directory if args.directory else memorize
    handler(args.path)
if __name__ == "__main__":
main()

84
ecl/embedding.py Normal file
View File

@ -0,0 +1,84 @@
import os
import openai
from openai import OpenAI
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
if 'BASE_URL' in os.environ:
BASE_URL = os.environ['BASE_URL']
else:
BASE_URL = None
import sys
import time
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
wait_fixed
)
from utils import log_and_print_online
sys.path.append(os.path.join(os.getcwd(),"ecl"))
class OpenAIEmbedding:
    """Client for OpenAI ``text-embedding-ada-002`` embeddings with usage counters.

    Token usage is accumulated separately for text and code requests
    (``text_*`` / ``code_*``) and in total (``prompt_tokens`` /
    ``total_tokens``).
    """

    def __init__(self, **params):
        self.code_prompt_tokens = 0
        self.text_prompt_tokens = 0
        self.code_total_tokens = 0
        self.text_total_tokens = 0
        self.prompt_tokens = 0
        self.total_tokens = 0

    def _request_embedding(self, content: str, kind: str):
        """Embed ``content`` and add the reported usage to the ``kind`` counters.

        Args:
            content: The string sent to the embeddings endpoint.
            kind: ``"text"`` or ``"code"``; selects the log wording and
                which pair of per-kind counters is incremented.

        Returns:
            The embedding of the first (only) input.
        """
        client_kwargs = {"api_key": OPENAI_API_KEY}
        if BASE_URL:
            client_kwargs["base_url"] = BASE_URL
        client = openai.OpenAI(**client_kwargs)

        response = client.embeddings.create(input=content, model="text-embedding-ada-002").model_dump()
        embedding = response['data'][0]['embedding']
        prompt_tokens = response["usage"]["prompt_tokens"]
        total_tokens = response["usage"]["total_tokens"]
        log_and_print_online(
            "Get {} embedding from {}:\n**[OpenAI_Usage_Info Receive]**\nprompt_tokens: {}\ntotal_tokens: {}\n".format(
                kind, response["model"], prompt_tokens, total_tokens))

        prompt_attr = "{}_prompt_tokens".format(kind)
        total_attr = "{}_total_tokens".format(kind)
        setattr(self, prompt_attr, getattr(self, prompt_attr) + prompt_tokens)
        setattr(self, total_attr, getattr(self, total_attr) + total_tokens)
        self.prompt_tokens += prompt_tokens
        self.total_tokens += total_tokens
        return embedding

    @retry(wait=wait_random_exponential(min=2, max=5), stop=stop_after_attempt(10))
    def get_text_embedding(self, text: str):
        """Return the embedding of ``text``; input over 8191 characters is cut to 8190."""
        # NOTE(review): the cap counts characters, not tokens — confirm
        # this is the intended guard against the model's input limit.
        if len(text) > 8191:
            text = text[:8190]
        return self._request_embedding(text, "text")

    @retry(wait=wait_random_exponential(min=10, max=60), stop=stop_after_attempt(10))
    def get_code_embedding(self, code: str):
        """Return the embedding of ``code``.

        Empty code is replaced by ``"#"`` before the request; code over
        8191 characters is cut to 8190.
        """
        if len(code) == 0:
            code = "#"
        elif len(code) > 8191:
            code = code[0:8190]
        return self._request_embedding(code, "code")

311
ecl/experience.py Normal file
View File

@ -0,0 +1,311 @@
import os
import time
from graph import Graph, Node, Edge
import sys
import openai
import numpy as np
from codes import Codes
from utils import get_easyDict_from_filepath,OpenAIModel,log_and_print_online
from embedding import OpenAIEmbedding
sys.path.append(os.path.join(os.getcwd(),"ecl"))
class Shortcut:
    """A shortcut between two graph nodes together with its value gain."""

    def __init__(self, sourceMID, targetMID, valueGain,instructionStar,edgeIDPath):
        # Attribute order is kept stable because __dict__ is exported as-is.
        self.sourceMID, self.targetMID = sourceMID, targetMID
        self.valueGain = valueGain
        self.embedding = None
        self.instructionStar = instructionStar
        self.edgeIDPath = edgeIDPath

    def __str__(self):
        # Only the first 100 characters of the instruction, on one line.
        preview = self.instructionStar[:100].replace("\n", "")
        path_length = len(self.edgeIDPath)
        template = "{} -> {} valueGain={:.6f} len(instructionPath)={} instructionStar={}"
        return template.format(self.sourceMID, self.targetMID, self.valueGain, path_length, preview)
class Experience:
def __init__(self, graph: Graph, directory: str):
    """Bind the experience extractor to ``graph`` and initialise node statistics.

    Reads the threshold and upper limit from ``./ecl/config.yaml``, counts
    one degree per incoming edge for every node and sets each node's value
    to 1.0.
    """
    config = get_easyDict_from_filepath("./ecl/config.yaml")
    self.graph: Graph = graph
    self.directory = directory
    self.threshold = config.experience.threshold
    self.upperLimit = config.experience.upper_limit
    self.experiences = []
    self.model = OpenAIModel(model_type="gpt-3.5-turbo-16k")
    self.embedding_method = OpenAIEmbedding()

    nodes = self.graph.nodes
    for edge in self.graph.edges:
        nodes[edge.targetMID].degree += 1

    total_degree = sum([node.degree for node in nodes.values()])
    assert len(self.graph.edges) * 1 == total_degree  # unidirectional

    for node in nodes.values():
        node.value = 1.0
def reap_zombie(self):
    """Drop every edge and node that is not on the graph's shortest path, logging each removal."""
    path_node_ids, path_edges = self.graph.find_shortest_path()
    dead_edges = [edge for edge in self.graph.edges if edge not in path_edges]
    dead_nodes = [node for mid, node in self.graph.nodes.items() if mid not in path_node_ids]

    edge_report = ["ZOMBIE EDGES: \n"]
    for edge in dead_edges:
        self.graph.edges.remove(edge)
        edge_report.append("Zombie Edge {} -> {} Removed\n".format(edge.sourceMID, edge.targetMID))
    log_and_print_online("".join(edge_report))

    node_report = ["ZOMBIE NODES: \n"]
    for node in dead_nodes:
        del self.graph.nodes[node.mID]
        node_report.append("Zombie Node {} Removed\n".format(node.mID))
    log_and_print_online("".join(node_report))
def estimate(self):
    """Estimate every node's value relative to the target of the last edge.

    Nodes with empty code are zeroed first. Each node's value is then
    replaced by ``_pairwise_estimate(node, final_node)``. Values before
    and after estimation are logged. Does nothing when the graph has no
    edges.
    """
    if len(self.graph.edges) == 0:
        return

    nodes = self.graph.nodes
    for node in nodes.values():
        if len(node.code) == 0:
            node.value *= 0.0

    log_and_print_online()

    # Snapshot before estimating: building this dict after the loop below
    # would just print the estimated values twice.
    init_values = {mid: node.value for mid, node in nodes.items()}

    vn = nodes[self.graph.edges[-1].targetMID]
    for mid in nodes.keys():
        vi = nodes[mid]
        vi.value = self._pairwise_estimate(vi, vn)

    estimated_values = {mid: node.value for mid, node in nodes.items()}
    log_and_print_online("Init value:" + str(init_values) + "\n\nEstimated value:" + str(estimated_values))
def get_cosine_similarity(self, embeddingi, embeddingj):
    """Return the cosine similarity of two embedding vectors.

    Args:
        embeddingi: First vector (any sequence accepted by ``np.array``).
        embeddingj: Second vector of the same length.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when
        either vector has zero norm.
    """
    embeddingi = np.array(embeddingi)
    embeddingj = np.array(embeddingj)
    denominator = np.linalg.norm(embeddingi) * np.linalg.norm(embeddingj)
    # A zero vector has no direction; without this guard the division
    # yields NaN, which callers' "== 0.0" checks do not catch.
    if denominator == 0:
        return 0.0
    return embeddingi.dot(embeddingj) / denominator
def _pairwise_estimate(self, vi: Node, vj: Node):
    """Estimate the value of node ``vi`` with respect to node ``vj``.

    The estimate is 0.0 as soon as one factor is zero: ``vi`` already has
    value 0, its code fails to run, it has no incoming edge, or one of the
    cosine similarities is 0. Otherwise it is::

        sim(vi.code, vj.code) / (vj.version - vi.version)
            * sim(vi.code, task prompt) * compile_weight * degree_weight

    and 1 when both nodes have the same version. Node and task embeddings
    are cached on the node / graph after their first computation.
    """
    if vi.value == 0.0:
        return 0.0

    pathNodes, pathEdges = self.graph.find_shortest_path(vi.mID, vj.mID)
    distance_weight = 1.0 / len(pathEdges) if len(pathEdges) != 0 else 1.0

    # Write the node's code to the temporary directory and try to run it.
    codes = Codes(vi.code)
    codes._rewrite_codes()
    exist_bugs_flag, _ = codes._run_codes()
    compile_weight = 0.0 if exist_bugs_flag else 1.0
    if compile_weight == 0.0:
        return 0.0

    maximum_degree = max([self.graph.nodes[mid].degree for mid in self.graph.nodes.keys()])
    degree_weight = vi.degree * 1.0 / maximum_degree
    if degree_weight == 0.0:
        return 0.0

    def code_embedding(node):
        # Compute (and time) the embedding only on first use; later calls
        # reuse the value cached on the node.
        if node.embedding is None:
            start_time = time.time()
            embedding = self.embedding_method.get_code_embedding(node.code)
            end_time = time.time()
            log_and_print_online("DONE:get node embedding\ntime cost:{}\n".format(end_time - start_time))
            node.embedding = embedding
        return node.embedding

    vi_code_emb = code_embedding(vi)
    vj_code_emb = code_embedding(vj)
    code_code_cos_sim = self.get_cosine_similarity(vi_code_emb, vj_code_emb)
    if code_code_cos_sim == 0.0:
        return 0.0

    # The task prompt is read from disk only when its embedding is not
    # cached yet, and the file handle is closed right after reading.
    if self.graph.task_embedding is None:
        filenames = os.listdir(self.directory)
        filename = [filename for filename in filenames if filename.endswith(".prompt")][0]
        with open(os.path.join(self.directory, filename), "r") as reader:
            task_prompt = reader.read().strip()
        start_time = time.time()
        task_emb = self.embedding_method.get_text_embedding(task_prompt)
        end_time = time.time()
        log_and_print_online("DONE:get task prompt embedding\ntime cost:{}\n".format(end_time - start_time))
        self.graph.task = task_prompt
        self.graph.task_embedding = task_emb
    task_emb = self.graph.task_embedding

    code_text_cos_sim = self.get_cosine_similarity(vi_code_emb, task_emb)
    if code_text_cos_sim == 0.0:
        return 0.0

    assert distance_weight >= 0.0 and distance_weight <= 1.0
    assert compile_weight >= 0.0 and compile_weight <= 1.0
    assert degree_weight >= 0.0 and degree_weight <= 1.0

    distance = vj.version - vi.version
    if distance == 0:
        return 1
    else:
        return code_code_cos_sim * 1.0 / distance * code_text_cos_sim * compile_weight * degree_weight
def get_transitive_closure(self):
    """Return the transitive closure of the shortest-path edges.

    The result is a nested dict ``closure[src][dst]`` holding 1 when
    ``dst`` is reachable from ``src`` along shortest-path edges and 0
    otherwise. The initial and the closed matrix are printed to stdout.
    """
    def dump(table):
        for row_key in table.keys():
            for col_key in table.keys():
                print(table[row_key][col_key], end=" ")
            print()
        print()

    # Warshall Algorithm
    node_ids = list(self.graph.nodes)
    closure = {src: {dst: 0 for dst in node_ids} for src in node_ids}

    _, path_edges = self.graph.find_shortest_path()
    for edge in path_edges:
        closure[edge.sourceMID][edge.targetMID] = 1
    print("Init Adjacent Matrix:")
    dump(closure)

    for via in node_ids:
        for src in node_ids:
            for dst in node_ids:
                reachable_directly = closure[src][dst] == 1
                reachable_via = closure[src][via] == 1 and closure[via][dst] == 1
                if reachable_directly or reachable_via:
                    closure[src][dst] = 1
    print("Transitive Closure:")
    dump(closure)

    return closure
def extract_thresholded_experiences(self):
    """Collect shortcut experiences along the shortest path and describe them.

    A pair of distinct shortest-path nodes (id1, id2) becomes a Shortcut
    when no direct edge joins them, id2 is reachable from id1 in the
    transitive closure, the value gain reaches ``self.threshold`` and the
    target code has no line that is exactly "pass" or "todo". The shortcuts
    are sorted by value gain (descending), truncated to ``self.upperLimit``
    and given an ``instructionStar`` generated by ``self.model``.

    Returns:
        The list of shortcuts (also stored in ``self.experiences``), or an
        empty list when the graph has no edges or fewer than two nodes.
    """
    if len(self.graph.edges) == 0:
        return []
    if len(self.graph.nodes) < 2:
        return []
    assert len(self.graph.nodes.keys()) >= 2
    matrix = self.get_transitive_closure()
    experiences = []
    pathNodes, _ = self.graph.find_shortest_path()
    for id1 in pathNodes:
        for id2 in pathNodes:
            valueGain = self.graph.nodes[id2].value - self.graph.nodes[id1].value
            # flag0: distinct nodes; flag1: not already directly connected;
            # flag2: id2 reachable from id1; flag3: gain reaches the threshold;
            # flag4: target code has no bare "pass"/"todo" line.
            flag0 = id1 != id2
            flag1 = self.graph.exists_edge(id1, id2) == False
            flag2 = matrix[id1][id2] == 1
            flag3 = valueGain >= self.threshold
            code_lines = [line.lower().strip() for line in self.graph.nodes[id2].code.split("\n")]
            flag4 = not ("pass".lower() in code_lines or "TODO".lower() in code_lines)
            if flag0 and flag1 and flag2 and flag3 and flag4:
                _, edges = self.graph.find_shortest_path(uMID=id1, vMID=id2)
                edgeIDPath = [edge.edgeId for edge in edges]
                sourcecode=self.graph.nodes[id1].code
                targetcode=self.graph.nodes[id2].code
                shortcut = Shortcut(sourceMID=id1, targetMID=id2, valueGain=valueGain,instructionStar="", edgeIDPath=edgeIDPath)
                experiences.append(shortcut)
    # Keep only the highest-gain shortcuts.
    experiences = sorted(experiences, key=lambda item: item.valueGain, reverse = True)
    if len(experiences) > self.upperLimit:
        log_and_print_online("{} experieces truncated.".format(len(experiences) - self.upperLimit))
        experiences = experiences[:self.upperLimit]
    # Template used when the shortcut's source node has no code.
    prompt_template0 = """Provide detailed instructions to generate the following code:
{targetcode}
The instructions should encompass:
Modules and Classes:
- Enumerate necessary modules.
- Detail the classes, their attributes, and methods within these modules.
- Articulate the purpose and operation of each class.
Data Structures:
- Identify the requisite data structures.
- Describe their names, attributes, and operations.
Main Program Flow:
- Outline the principal progression of the program.
- Highlight the sequence for initializing and invoking other modules, classes, and methods within the primary file (e.g., main.py).
- Clarify the logical progression during runtime.
Input and Output:
- Specify the method by which the program accepts input, be it from users or external sources.
- Elaborate on the projected outputs or actions of the software.
Exception Handling:
- Instruct on the approach to manage potential anomalies or exceptions during execution to ascertain stability and robustness.
External Libraries and Dependencies:
- Explicitly list the necessary external libraries or dependencies, their versions, and their functionalities.
Please output the instructions directly."""
    # Template used when both source and target code are available.
    prompt_template1 = """Please provide detailed instructions on how to transition from the initial code version represented by source code to the final version indicated by target code.
Source Code:
{sourcecode}
Target Code:
{targetcode}
The instructions should encompass:
Modules and Classes: Detail the modules to be incorporated, along with the names, attributes, and operations of any classes to be added or amended. Furthermore, describe the intended function and utility of these new or altered classes.
Data Structures: Clearly define any data structures that need introduction or alteration, elucidating their names, attributes, and functionalities.
Main Program Flow: Outline the program's primary sequence of operations, highlighting the procedures to initialize and invoke other modules, classes, and methods in the primary file (e.g., main.py). Describe the program's logic sequence during its execution.
Input and Output: Define the methodology by which the program will acquire input, whether from users or external data sources. Also, characterize the projected outputs or behaviors of the application.
Exception Handling: Provide guidance on managing potential discrepancies or exceptions that might emerge during the software's operation, ensuring its resilience and reliability.
External Libraries and Dependencies: If the implementation requires external libraries or dependencies, specify their names, versions, and their respective purposes explicitly."""
    # Ask the model for the instruction that leads from source to target code.
    for shortcut in experiences:
        sourcecode = self.graph.nodes[shortcut.sourceMID].code
        targetcode = self.graph.nodes[shortcut.targetMID].code
        if sourcecode == "":
            prompt = prompt_template0.replace("{targetcode}", targetcode)
            response = self.model.run(messages=[{"role": "system", "content": prompt}])
            print("instructionstar generated")
        else:
            prompt = prompt_template1.replace("{sourcecode}", sourcecode).replace("{targetcode}", targetcode)
            response = self.model.run(messages=[{"role": "system", "content": prompt}])
            print("instructionstar generated")
        shortcut.instructionStar = response["choices"][0]["message"]["content"]
    output = "Sorted-and-Truncated Experiences (with instructionStar):"
    self.experiences = experiences
    for experience in experiences:
        output += str(experience)
    log_and_print_online(output)
    # Token usage summaries for the chat model and the embedding client.
    log_and_print_online("[Conclusion]:\nprompt_tokens:{}, completion_tokens:{}, total_tokens:{}".format(self.model.prompt_tokens,self.model.completion_tokens,self.model.total_tokens))
    log_and_print_online("[Conclusion]:\ntext_prompt_tokens:{}, text_total_tokens:{}\ncode_prompt_tokens:{}, code_total_tokens:{}\nprompt_tokens:{}, total_tokens:{}".format(self.embedding_method.text_prompt_tokens,
    self.embedding_method.text_total_tokens,
    self.embedding_method.code_prompt_tokens,
    self.embedding_method.code_total_tokens,
    self.embedding_method.prompt_tokens,
    self.embedding_method.total_tokens))
    return experiences
def to_dict(self):
    """Return the attribute dictionary of every stored experience, in order.

    Each element is the experience object's own ``__dict__`` (not a copy).
    """
    return [experience.__dict__ for experience in self.experiences]

327
ecl/graph.py Normal file
View File

@ -0,0 +1,327 @@
import os
import subprocess
import hashlib
from queue import Queue
import re
from utils import cmd,log_and_print_online
class Node:
    """One snapshot of a project's source code.

    ``mID`` is the md5 hex digest of the formatted code and is what edges use
    to reference the node.
    """

    def __init__(self):
        self.code = None
        self.version = None
        self.commitMessage = None
        self.mID = None
        self.role = None
        self.degree = 0
        self.value = 0.0
        self.embedding = None

    @staticmethod
    def _parse_oneline_head(content):
        """Parse the first line of ``git log --oneline`` output.

        Args:
            content: raw command output; the first line is expected to look
                like ``<hash> v<number> <message...>``, optionally with a
                ``(HEAD -> main)`` decoration after the hash.

        Returns:
            ``(commit_message, version)`` where the message is everything
            after the hash (version token included) and version is a float.

        Raises:
            IndexError: the first line has no token after the hash.
            ValueError: the token after the hash is not ``v<number>``.
        """
        # Whitespace-splitting tolerates the double space that removing the
        # decoration leaves behind; splitting on a single " " would yield an
        # empty token there and break the float conversion.
        tokens = content.replace("(HEAD -> main)", "").split("\n")[0].split()
        message = " ".join(tokens[1:])
        version = float(tokens[1].replace("v", ""))
        return message, version

    def create_from_warehouse(self, directory) -> None:
        """Populate this node from the ``.py`` files and git head of ``directory``.

        Args:
            directory: project folder; must contain at least one ``.py`` file
                at its top level and be a git repository.

        Raises:
            AssertionError: no top-level ``.py`` file exists.
        """
        def _format_code(code):
            # Drop blank / whitespace-only lines.
            return "\n".join(line for line in code.split("\n") if len(line.strip()) > 0)

        assert len([filename for filename in os.listdir(directory) if filename.endswith(".py")]) > 0

        # Read all .py files. os.walk descends into sub-folders, so each file
        # must be opened relative to the folder it was found in. The first
        # file seen for a given name wins (top level is visited first).
        codebooks = {}
        for root, _directories, filenames in os.walk(directory):
            for filename in filenames:
                if filename.endswith(".py") and filename not in codebooks:
                    with open(os.path.join(root, filename), "r", encoding="utf-8") as source:
                        codebooks[filename] = _format_code(source.read())

        # Format codes as "<filename>\n```Python\n<code>\n```" sections.
        code = ""
        for filename, body in codebooks.items():
            code += "{}\n```Python\n{}\n```\n\n".format(filename, body)
        self.code = code
        self.mID = hashlib.md5(self.code.encode(encoding='UTF-8')).hexdigest()

        content = cmd("cd {} && git log --oneline".format(directory))
        self.commitMessage, self.version = self._parse_oneline_head(content)
class Edge:
    """Directed transition from one code snapshot to another.

    The endpoints are stored as node mIDs, together with the instruction
    attached to the transition and the role it is attributed to.
    """

    def __init__(self, sourceMID, targetMID, instruction, role):
        self.sourceMID, self.targetMID = sourceMID, targetMID
        self.instruction, self.role = instruction, role
        # Both start unset and are assigned after construction.
        self.edgeId = self.embedding = None
class Graph:
    """Version history of one generated project as a directed graph.

    ``nodes`` maps a snapshot's mID to the snapshot; ``edges`` lists the
    transitions between snapshots in the order they were added.
    """

    # One log record: "[YYYY-MM-DD hh:mm:ss LEVEL] text", where the text runs
    # up to the next line starting with "[<digit>" or to the end of the log.
    _RECORD_REGEX = r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \w+)\] ([.\s\S\n\r\d\D\t]*?)(?=\n\[\d|$)"

    def __init__(self):
        self.task = ""
        self.task_embedding = None
        self.nodes = {}
        self.edges = []
        self.directory: str = None

    def addNode(self, node: "Node"):
        """Register ``node`` under its mID unless that mID is already known."""
        if node.mID not in self.nodes:
            self.nodes[node.mID] = node

    def addEdge(self, edge: "Edge"):
        """Append ``edge`` and give it an id derived from its insertion index."""
        num = "edge_{}".format(len(self.edges))
        edge.edgeId = hashlib.md5(num.encode(encoding='UTF-8')).hexdigest()
        self.edges.append(edge)

    def exists_edge(self, mid1: str, mid2: str):
        """Return True when an edge ``mid1 -> mid2`` has been added."""
        return any(edge.sourceMID == mid1 and edge.targetMID == mid2 for edge in self.edges)

    @staticmethod
    def _find_log_file(directory):
        """Return the first ``*.log`` name listed in ``directory``, or None."""
        for filename in os.listdir(directory):
            if filename.endswith(".log"):
                return filename
        return None

    @classmethod
    def _read_log_records(cls, directory, log_filename):
        """Return ``(content, records)`` for a log file.

        ``content`` is the whole file text; ``records`` is a list of
        ``(timestamp_and_level, text)`` pairs matched by ``_RECORD_REGEX``.
        """
        with open(os.path.join(directory, log_filename), "r", encoding='UTF-8') as log_file:
            content = log_file.read()
        records = [(match.group(1), match.group(2))
                   for match in re.finditer(cls._RECORD_REGEX, content, re.DOTALL)]
        return content, records

    def create_from_warehouse(self, directory) -> None:
        """Build the graph by checking out every commit of a git repository.

        The newest commit is treated as the "log commit": it gets no node and
        is checked out again at the end, even if node construction fails.
        NOTE: this runs ``git reset --hard`` inside ``directory``.
        """
        self.directory = directory
        content = cmd("cd {} && git log --oneline".format(directory))
        # Oldest commit first, preceded by an all-zero id for the empty project.
        cIDs = ["0" * 7] + [line.split(" ")[0] for line in content.split("\n") if len(line) > 0][::-1]
        log_cID = cIDs[-1]
        cIDs = cIDs[:-1]
        log_and_print_online("commit history:" + str(cIDs) + "\nlog commit:" + str(log_cID))

        # Commit ID -> md5 ID while constructing the nodes.
        cID2mID = {}
        output = ""
        try:
            for cID in cIDs:
                node = Node()
                if cID == "0" * 7:
                    node.code = ""
                    node.mID = hashlib.md5("".encode(encoding='UTF-8')).hexdigest()
                    node.commitMessage = ""
                    # NOTE(review): a string here, while other nodes carry a float version.
                    node.version = "v0.0"
                else:
                    cmd("cd {} && git reset --hard {}".format(directory, cID))
                    node.create_from_warehouse(directory)
                cID2mID[cID] = node.mID
                self.addNode(node)
                output += ("Node: {} -> {}\n".format(cID, node.mID))
        finally:
            cmd("cd {} && git reset --hard {}".format(directory, log_cID))
        log_and_print_online(output)

        # Consecutive commits become edges; text is filled in from the log.
        for sourceCID, targetCID in zip(cIDs, cIDs[1:]):
            self.addEdge(Edge(cID2mID[sourceCID], cID2mID[targetCID], instruction="", role=""))
        self._create_instruction_and_roles_from_log(directory)

    def create_from_log(self, directory) -> None:
        """Build the graph by replaying the code blocks found in the project log.

        Every "Programmer<->" utterance updates a running codebook; each time
        the codebook's fingerprint is new, a node is added and linked to the
        previously added node. Does nothing when no ``.log`` file exists.
        """
        def extract_filename_from_line(lines):
            # Last "<word>.<word>" token in the text preceding the fence.
            file_name = ""
            for candidate in re.finditer(r"(\w+\.\w+)", lines, re.DOTALL):
                file_name = candidate.group()
            return file_name.lower()

        def extract_filename_from_code(code):
            # Last class name declared in the code, lower-cased, plus ".py".
            file_name = ""
            for match_extract in re.finditer(r"class (\S+?):\n", code, re.DOTALL):
                file_name = match_extract.group(1)
            return file_name.lower().split("(")[0] + ".py"

        def _format_code(code):
            return "\n".join(line for line in code.split("\n") if len(line.strip()) > 0)

        def update_codebook(utterance, codebook):
            # Each match is "<text line>\n```<lang>\n<code>```".
            for match in re.finditer(r"(.+?)\n```.*?\n(.*?)```", utterance, re.DOTALL):
                code = match.group(2)
                if "CODE" in code:
                    continue
                filename = extract_filename_from_line(match.group(1))
                if "__main__" in code:
                    filename = "main.py"
                if filename == "":
                    filename = extract_filename_from_code(code)
                assert filename != ""
                if filename is not None and code is not None and len(filename) > 0 and len(code) > 0:
                    codebook[filename] = _format_code(code)

        def get_codes(codebook):
            content = ""
            for filename, body in codebook.items():
                language = "python" if filename.endswith(".py") else filename.split(".")[-1]
                content += "{}\n```{}\n{}\n```\n\n".format(filename, language, body)
            return content

        self.directory = directory
        log_filename = self._find_log_file(directory)
        if log_filename is None:
            return
        print("log_filename:", log_filename)
        _content, records = self._read_log_records(directory, log_filename)

        utterances = ["[{}] {}".format(stamp, text) for stamp, text in records]
        utterances = [utterance for utterance in utterances if
                      "flask app.py" not in utterance and "OpenAI_Usage_Info" not in utterance]
        index = [i for i, utterance in enumerate(utterances) if
                 "Programmer<->Chief Technology Officer on : EnvironmentDoc" in utterance]
        if len(index) > 0:
            # Stop one record before the marker; clamp so a marker at position
            # 0 yields an empty list instead of the slice [:-1].
            utterances = utterances[:max(index[0] - 1, 0)]
        utterances_code = [utterance for utterance in utterances if
                           "Programmer<->" in utterance and "EnvironmentDoc" not in utterance and "TestErrorSummary" not in utterance]
        print("len(utterances_code):", len(utterances_code))

        codebook, fingerprints, pre_mid = {}, set(), ""
        for utterance in utterances_code:
            update_codebook(utterance, codebook)
            codes = get_codes(codebook)
            # construct node
            node = Node()
            node.mID = hashlib.md5(codes.encode(encoding='UTF-8')).hexdigest()
            node.commitMessage = ""
            node.code = codes
            node.version = float(len(fingerprints))
            if node.mID not in fingerprints:
                fingerprints.add(node.mID)
                self.addNode(node)
                # construct edge from the previously added snapshot
                if pre_mid != "":
                    self.addEdge(Edge(pre_mid, node.mID, instruction="", role=""))
                pre_mid = node.mID
        self._create_instruction_and_roles_from_log(directory)

    def _create_instruction_and_roles_from_log(self, directory) -> None:
        """Fill ``instruction`` and ``role`` of the edges from the project log.

        Instructions are taken, in log order, from the "[Start Chat]"
        utterances of the CTO, the code reviewer and the test engineer, and
        are assigned to the edges positionally. Does nothing when no ``.log``
        file exists.
        """
        log_filename = self._find_log_file(directory)
        if log_filename is None:
            return
        log_and_print_online("log_filename:" + log_filename)
        content, records = self._read_log_records(directory, log_filename)

        utterances = [text for _stamp, text in records]
        utterances = [utterance for utterance in utterances if
                      "Chief Technology Officer: **[Start Chat]**" in utterance
                      or "Code Reviewer: **[Start Chat]**" in utterance
                      or "Software Test Engineer: **[Start Chat]**" in utterance]
        if "Test Pass!" in content:
            utterances.append("Software Test Engineer: **[Start Chat]**\n\nTest Pass!")

        instructions, roles = [], []
        for utterance in utterances:
            # All matching below is done on the lower-cased text.
            utterance = utterance.lower()
            instruction = ""
            if "Chief Technology Officer: **[Start Chat]**".lower() in utterance:
                instruction = "write one or multiple files and make sure that every detail of the architecture is implemented as code"
            elif "Code Reviewer: **[Start Chat]**".lower() in utterance:
                instruction = utterance.split("Comments on Codes:".lower())[-1].split("In the software,".lower())[0]
                instruction = instruction.replace("<comment>".lower(), "")
            elif "Software Test Engineer: **[Start Chat]**".lower() in utterance:
                if "Test Pass!".lower() in utterance:
                    instruction = "Test Pass!"
                else:
                    instruction = utterance.split("Error Summary of Test Reports:".lower())[-1].split("Note that each file must strictly follow a markdown code block format".lower())[0]
            else:
                assert False
            role = utterance.split(": **")[0]

            # Trim whitespace and one pair of surrounding double quotes.
            instruction = instruction.strip()
            if instruction.startswith("\""):
                instruction = instruction[1:]
            if instruction.endswith("\""):
                instruction = instruction[:-1]
            instructions.append(instruction.strip())
            roles.append(role.strip())

        if len(instructions) < len(self.edges):
            log_and_print_online("only {} instructions found for {} edges; remaining edges keep their current instruction".format(
                len(instructions), len(self.edges)))
        # zip stops at the shorter sequence, so a short log no longer raises
        # IndexError half-way through the assignment.
        for edge, instruction, role in zip(self.edges, instructions, roles):
            edge.instruction = instruction
            edge.role = role

    def find_shortest_path(self, uMID=None, vMID=None):
        """Breadth-first search for a shortest path between two snapshots.

        Args:
            uMID: start mID; defaults to the source of the first edge.
            vMID: goal mID; defaults to the target of the last edge.

        Returns:
            ``(pathNodes, pathEdges)`` — the mIDs from start to goal and the
            edges walked between them — or None when the goal is unreachable.

        Raises:
            IndexError: a default endpoint is needed but the graph has no edges.
        """
        if uMID is None:
            uMID = self.edges[0].sourceMID
        if vMID is None:
            vMID = self.edges[-1].targetMID
        pending, visited, preMID, preEdge = Queue(), {uMID}, {}, {}
        pending.put(uMID)
        while not pending.empty():
            mID = pending.get()
            if mID == vMID:
                # Walk the predecessor links back to the start, then reverse.
                current, pathNodes, pathEdges = vMID, [], []
                while current != uMID:
                    pathNodes.append(current)
                    pathEdges.append(preEdge[current])
                    current = preMID[current]
                pathNodes.append(uMID)
                return pathNodes[::-1], pathEdges[::-1]
            for edge in self.edges:
                if edge.sourceMID == mID and edge.targetMID not in visited:
                    visited.add(edge.targetMID)
                    preMID[edge.targetMID] = mID
                    preEdge[edge.targetMID] = edge
                    pending.put(edge.targetMID)
        return None

    def print(self):
        """Log a textual summary of all nodes and edges."""
        output = "\n" + "*" * 50 + " Graph " + "*" * 50 + "\n"
        output += "{} Nodes:\n".format(len(self.nodes))
        for node in self.nodes.values():
            output += "{}, {}, {}\n".format(node.mID, node.version, node.commitMessage)
        output += "{} Edges:\n".format(len(self.edges))
        for edge in self.edges:
            output += "{}: {} -> {} ({}: {})\n".format(edge.edgeId, edge.sourceMID, edge.targetMID, edge.role, edge.instruction[:60])
        output += "*" * 50 + " Graph " + "*" * 50
        log_and_print_online(output)

    def to_dict(self):
        """Return ``(node_dicts, edge_dicts)``: the ``__dict__`` of every node and edge."""
        merged_node_dict = [node.__dict__ for node in self.nodes.values()]
        merged_edge_dict = [edge.__dict__ for edge in self.edges]
        return merged_node_dict, merged_edge_dict

430
ecl/memory.py Normal file
View File

@ -0,0 +1,430 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from abc import ABC, abstractmethod
import json
import time
import math
import os
import sys
import openai
import faiss
import numpy as np
from datetime import datetime
sys.path.append(os.path.join(os.getcwd(),"ecl"))
#from utils import get_code_embedding,get_text_embedding
from utils import get_easyDict_from_filepath,log_and_print_online
from embedding import OpenAIEmbedding
class MemoryBase(ABC):
    """Base class for a JSON-backed experience memory.

    Retrieval settings are read from ``./ecl/config.yaml``; the memory itself
    is loaded from ``directory`` (a ``.json`` file path), which is created
    with an empty JSON object when it does not exist yet.
    """

    def __init__(self, directory: str) -> None:
        self.directory: str = directory
        cfg = get_easyDict_from_filepath("./ecl/config.yaml")
        self.top_k_code = cfg.retrieval.top_k_code
        self.top_k_text = cfg.retrieval.top_k_text
        self.code_thresh = cfg.retrieval.searchcode_thresh
        self.text_thresh = cfg.retrieval.searchtext_thresh
        self.embedding_method = None
        if cfg.embedding_method == "OpenAI":
            self.embedding_method = OpenAIEmbedding()
        self.content = None
        if os.path.exists(self.directory) and self.directory.endswith('.json'):
            with open(self.directory) as file:
                self.content = json.load(file)
        elif not os.path.exists(self.directory):
            with open(self.directory, 'w') as file:
                json.dump({}, file)  # Create an empty JSON file
            print(f"Now the memory file '{self.directory}' is created")
        if self.content is None:
            print("Empty Memory")

    @abstractmethod
    def memory_retrieval(self) -> str:
        pass

    def _get_memory_count(self) -> int:
        """Return the ``total`` recorded on the last memory entry, else 0.

        0 is returned when the content is not a list, is an empty list, or
        the last entry carries no ``total``.
        """
        if isinstance(self.content, list) and self.content:
            return self.content[-1].get("total") or 0
        return 0


class AllMemory(MemoryBase):
    """Memory over all stored tasks, with embedding-based retrieval."""

    def __init__(self, directory: str):
        super().__init__(directory)

    # unused; init experience list
    def _init_explist(self):
        """Collect every stored experience into ``self.exp_list``.

        ``exp_list`` stays None when no memory content is loaded.
        """
        if self.content is None:
            self.exp_list = None
            return
        self.exp_list = []
        for t in self.content:
            self.exp_list.extend(t.get("experiences") or [])

    # clear all memory
    def _memory_clear(self) -> None:
        """Overwrite the memory file with an empty JSON object and drop the content."""
        if os.path.exists(self.directory) and self.directory.endswith('.json'):
            # Must be opened for writing; the default read mode cannot be dumped to.
            with open(self.directory, 'w') as file:
                json.dump({}, file)
            self.content = None

    # get code sample
    def get_codesample(self) -> str:
        """Return the code of the last node of the last entry, or None when empty."""
        if self._get_memory_count() >= 1:
            return self.content[-1].get("nodes")[-1]["code"]
        return None

    # get text str sample
    def get_textsample(self) -> str:
        """Return the instruction of the last edge of the last entry, or None when empty."""
        if self._get_memory_count() >= 1:
            return self.content[-1].get("edges")[-1].get("instruction")
        return None

    # get code embedding from code mID
    def _get_codeembedding(self, mid):
        """Return the stored embedding of the first node whose mID is ``mid``, else None."""
        for t in self.content:
            for node in t["nodes"]:
                if node["mID"] == mid:
                    return node.get("embedding")
        return None

    # get instructionstar from sourcecode mID
    def _get_instructionstar(self, mid):
        """Return the instructionStar with the highest valueGain for source ``mid``.

        Among equal gains the later experience wins. Experiences without a
        valueGain rank lowest. Returns None when no experience matches.
        """
        max_valueGain = -1
        instructionstar = None
        for t in self.content:
            for experience in t["experiences"]:
                if experience is None or experience["sourceMID"] != mid:
                    continue
                gain = experience.get("valueGain")
                if gain is None:
                    gain = -1
                if gain >= max_valueGain:
                    max_valueGain = gain
                    instructionstar = experience.get("instructionStar")
        return instructionstar

    def _get_task_by(self, key, mid):
        """Return ``(task, dir)`` of the last entry holding an experience with ``experience[key] == mid``."""
        task = None
        task_dir = None
        for t in self.content:
            for experience in t["experiences"]:
                if experience is not None and experience[key] == mid:
                    task = t["task"]
                    task_dir = t["dir"]
        return task, task_dir

    # get experience task and dir from sourcecode mID
    def _get_task_from_source(self, mid):
        return self._get_task_by("sourceMID", mid)

    # get experience task and dir from targetcode mID
    def _get_task_from_target(self, mid):
        return self._get_task_by("targetMID", mid)

    # retrieval from MemoryCards
    def memory_retrieval(self, input_message: str, type: str, k=None):
        """Dispatch to ``search_code`` (``type == "code"``) or ``search_text`` (``"text"``).

        ``k`` defaults to the configured top-k of the chosen search; any other
        ``type`` yields None.
        """
        if type == "code":
            return self.search_code(input_message, self.top_k_code if k is None else k)
        if type == "text":
            return self.search_text(input_message, self.top_k_text if k is None else k)
        return None

    def search_text(self, code_query, k: int):
        """
        search instructionStar from a code query
        Keyword arguments:
        code_query -- code input
        k -- the number of instructions to search
        Return:
        (instructionStar list, similarities string, source mIDs, tasks, task dirs),
        or None when the memory is empty, the query is None or k is 0
        """
        if self._get_memory_count() == 0 or code_query is None or k == 0:
            return None
        code_query = self.embedding_method.get_code_embedding(code_query)
        if isinstance(code_query, list):
            code_query = np.array(code_query, dtype=np.float32)
        code_query = code_query.reshape(1, -1)

        # Distinct source snapshots in first-seen order; None placeholders skipped.
        sourcecodemid_list = list(dict.fromkeys(
            experience.get("sourceMID")
            for t in self.content
            for experience in t["experiences"]
            if experience is not None
        ))
        if not sourcecodemid_list:
            return None
        code_embeddings = [self._get_codeembedding(mid) for mid in sourcecodemid_list]
        code_embedding_data = np.array(code_embeddings, dtype=np.float32)
        faiss.normalize_L2(code_embedding_data)
        faiss.normalize_L2(code_query)
        # On unit vectors, squared L2 distance d relates to cosine as cos = 1 - d/2.
        index = faiss.IndexFlatL2(code_embedding_data.shape[1])
        index.add(code_embedding_data)
        # index.search returns squared L2 distances
        distances, indices = index.search(code_query, k)
        similarities = 1 - (1 / 2) * distances

        task_list = []
        task_dir_list = []
        instructionStar_list = []
        sourceMIDS = []
        for hit, similarity in zip(indices[0], similarities[0]):
            # -1 marks "no result" slots when fewer than k vectors are indexed.
            if hit != -1 and similarity >= self.text_thresh:
                mid = sourcecodemid_list[hit]
                task, task_dir = self._get_task_from_source(mid)
                sourceMIDS.append(mid)
                task_list.append(task)
                task_dir_list.append(task_dir)
                instructionStar_list.append(self._get_instructionstar(mid))
        filtered_similarities = np.array2string(similarities[:, :k])
        return instructionStar_list, filtered_similarities, sourceMIDS, task_list, task_dir_list

    def search_code(self, text_query, k: int):
        """search best code from a text query
        Keyword arguments:
        text_query -- text input
        k -- the number of code to search
        Return:
        (code list, similarities string, target mIDs, tasks, task dirs),
        or None when the memory is empty, the query is None or k is 0
        """
        if self._get_memory_count() == 0 or text_query is None or k == 0:
            return None
        text_query = self.embedding_method.get_text_embedding(text_query)
        if isinstance(text_query, list):
            text_query = np.array(text_query, dtype=np.float32)
        text_query = text_query.reshape(1, -1)

        # Embeddings and experiences are built from the same filtered list so
        # that search result positions index straight into experience_list.
        experience_list = [experience for t in self.content for experience in t["experiences"]
                           if experience is not None]
        if not experience_list:
            return None
        text_embedding_data = np.array([exp.get("embedding") for exp in experience_list], dtype=np.float32)
        faiss.normalize_L2(text_embedding_data)
        faiss.normalize_L2(text_query)
        # On unit vectors, squared L2 distance d relates to cosine as cos = 1 - d/2.
        total_instructionStar = text_embedding_data.shape[0]
        index = faiss.IndexFlatL2(text_embedding_data.shape[1])
        index.add(text_embedding_data)
        # index.search returns squared L2 distances
        distances, indices = index.search(text_query, total_instructionStar)
        similarities = 1 - (1 / 2) * distances

        code_node_list = [node for t in self.content for node in t["nodes"]]
        targetMIDs = []
        target_code = []
        task_list = []
        task_dir_list = []
        filtered_similarities = []
        added_set = set()
        counter = 0
        for hit, similarity in zip(indices[0], similarities[0]):
            # Stop at the first empty slot, once k distinct targets are
            # collected, or when similarity drops to the threshold.
            if hit == -1 or counter >= k or similarity <= self.code_thresh:
                break
            mid = experience_list[hit].get("targetMID")
            if mid not in added_set:
                targetMIDs.append(mid)
                added_set.add(mid)
                counter += 1
                filtered_similarities.append(str(similarity))
        for targetMID in targetMIDs:
            for code_node in code_node_list:
                if targetMID == code_node.get("mID"):
                    target_code.append(code_node.get("code"))
                    task, task_dir = self._get_task_from_target(targetMID)
                    task_list.append(task)
                    task_dir_list.append(task_dir)
        filtered_similarities = ",".join(filtered_similarities)
        return target_code, filtered_similarities, targetMIDs, task_list, task_dir_list
class Memory:
    """Front end that loads, extends and prunes the on-disk experience memory."""

    def __init__(self):
        self.directory: str = None
        self.id_enabled: bool = False
        self.user_memory_filepath: str = None
        self.assistant_memory_filepath: str = None
        self.update_count = 0
        self.memory_keys: List[str] = ["All"]
        self.memory_data = {}

    def __str__(self) -> str:
        if self.memory_data.get("All") is None:
            return "No existed memory"
        return "Current memory length:{}".format(self.memory_data["All"]._get_memory_count())

    def _set_embedding(self, experience):
        """Compute every missing edge, node and experience embedding, logging the time spent."""
        graph = experience.graph
        embedder = self.memory_data["All"].embedding_method

        edge_start_time = time.time()
        for edge in graph.edges:
            if edge.embedding is None:
                start_time = time.time()
                edge.embedding = embedder.get_text_embedding(edge.instruction)
                end_time = time.time()
                log_and_print_online("DONE: get edge embedding\ntime cost:{}\n".format(end_time - start_time))
        edge_duration = time.time() - edge_start_time
        log_and_print_online("DONE: got all EDGE embeddings\nEDGE embedding time cost:{}\n".format(edge_duration))

        node_start_time = time.time()
        for node in graph.nodes.values():
            if node.embedding is None:
                start_time = time.time()
                node.embedding = embedder.get_code_embedding(node.code)
                end_time = time.time()
                log_and_print_online("DONE: get node embedding\ntime cost:{}\n".format(end_time - start_time))
        node_duration = time.time() - node_start_time
        log_and_print_online("DONE: got all NODE embeddings\nNODE embedding time cost:{}\n".format(node_duration))

        exp_start_time = time.time()
        for exp in experience.experiences:
            if exp.embedding is None:
                start_time = time.time()
                exp.embedding = embedder.get_text_embedding(exp.instructionStar)
                end_time = time.time()
                log_and_print_online("DONE: get exprience embedding\ntime cost:{}\n".format(end_time - start_time))
        exp_duration = time.time() - exp_start_time
        log_and_print_online("DONE: got all EXPERIENCE embeddings\nEXPERIENCE embedding time cost:{}\n".format(exp_duration))

        duration = edge_duration + node_duration + exp_duration
        log_and_print_online("All embedding DONE\ntime cost:{}\n".format(duration))

    # create memory path and upload memory from existed memory
    def upload(self):
        """Ensure ``<cwd>/ecl/memory`` exists and load ``MemoryCards.json`` from it."""
        self.directory = os.path.join(os.getcwd(), "ecl", "memory")
        os.makedirs(self.directory, exist_ok=True)
        for key in self.memory_keys:
            if key == "All":
                path = os.path.join(self.directory, "MemoryCards.json")
                self.memory_data[key] = AllMemory(path)

    def _collect_previous_memory(self):
        """Flatten the loaded memory into a list of non-empty entries.

        Returns:
            ``(entries, next_index)``. For list content, nested lists are
            flattened one level, empty entries are dropped and ``next_index``
            is the ``total`` of the last kept entry (0 when none is kept).
            Non-empty non-list content is kept as a single entry with
            ``next_index`` 1.
        """
        content = self.memory_data["All"].content
        previous_memory = content if content is not None and len(content) != 0 else []
        merged_dic = []
        index = 0
        if isinstance(previous_memory, list):
            for t in previous_memory:
                if isinstance(t, list):
                    merged_dic.extend(subt for subt in t if len(subt) != 0)
                elif len(t) != 0:
                    merged_dic.append(t)
            if merged_dic:
                index = merged_dic[-1]["total"]
        else:
            merged_dic.append(previous_memory)
            index = 1
        return merged_dic, index

    # upload experience into memory
    def upload_from_experience(self, experience):
        """Append ``experience`` (graph + experiences) to the memory file.

        The new file content is fully assembled before the file is opened for
        writing, so a failure while building it cannot wipe the stored memory.
        """
        self._set_embedding(experience)
        store = self.memory_data["All"]
        node_data, edge_data = experience.graph.to_dict()
        experience_data = experience.to_dict()

        previous_count = len(store.content) if store.content is not None else 0
        log_and_print_online("len(previous_memory)={}".format(previous_count))
        merged_dic, index = self._collect_previous_memory()

        combined_json_str = {}
        combined_json_str["index"] = index
        combined_json_str["dir"] = experience.graph.directory
        combined_json_str["task"] = experience.graph.task
        combined_json_str["nodes"] = node_data
        combined_json_str["edges"] = edge_data
        combined_json_str["experiences"] = experience_data
        combined_json_str["total"] = combined_json_str["index"] + 1
        merged_dic.append(combined_json_str)

        with open(store.directory, 'w') as file:
            json.dump(merged_dic, file)
        # Keep the in-memory view in step with the file for later calls.
        store.content = merged_dic

        log_and_print_online("len(merged_dic)={}".format(len(merged_dic)) + "\n merged_dic dumped to {}".format(store.directory))
        log_and_print_online("[Conclusion]:\ntext_prompt_tokens:{}, text_total_tokens:{}\ncode_prompt_tokens:{}, code_total_tokens:{}\nprompt_tokens:{}, total_tokens:{}".format(store.embedding_method.text_prompt_tokens,
                             store.embedding_method.text_total_tokens,
                             store.embedding_method.code_prompt_tokens,
                             store.embedding_method.code_total_tokens,
                             store.embedding_method.prompt_tokens,
                             store.embedding_method.total_tokens))

    # delete memory from index
    def delete_memroy(self, idx: int):
        """Remove the entry at position ``idx`` and rewrite the memory file.

        An out-of-range ``idx`` removes nothing; the flattened memory is
        written back either way.
        """
        store = self.memory_data["All"]
        merged_dic, _ = self._collect_previous_memory()
        if -len(merged_dic) <= idx < len(merged_dic):
            merged_dic.pop(idx)
        with open(store.directory, 'w') as file:
            json.dump(merged_dic, file)
        store.content = merged_dic

    # Correctly spelled alias; the original name is kept for existing callers.
    delete_memory = delete_memroy

View File

@ -0,0 +1 @@
{}

View File

@ -0,0 +1,50 @@
import json
import os
import argparse
# Default minimum valueGain an experience needs in order to be kept.
filter_threshold = 0.9


def filter_valuegain(directory, filtered_directory, threshold=None):
    """Filter memory by experience valueGain.

    Experiences whose valueGain is missing or smaller than the threshold are
    removed from every memory piece; pieces without an "experiences" field
    are copied unchanged.

    Keyword arguments:
    directory -- path of the input MemoryCards JSON file, like "./ecl/memory/MemoryCards.json"
    filtered_directory -- path of the output JSON file, like "./ecl/memory/MemoryCards.json"
    threshold -- minimum valueGain to keep; defaults to the module-level filter_threshold
    """
    if threshold is None:
        threshold = filter_threshold
    with open(directory) as file:
        content = json.load(file)
    for memorypiece in content:
        experiences = memorypiece.get("experiences")
        if experiences is None:
            continue
        print("origin:", len(experiences))
        filtered_experienceList = []
        for experience in experiences:
            valueGain = experience.get("valueGain") if experience is not None else None
            print(valueGain)
            if valueGain is not None and valueGain >= threshold:
                filtered_experienceList.append(experience)
        print(len(filtered_experienceList))
        memorypiece["experiences"] = filtered_experienceList
    # Pieces were filtered in place, so the original top-level container
    # (including an empty one) is written back as loaded.
    with open(filtered_directory, 'w') as file:
        json.dump(content, file)
def main():
    """Parse ``threshold directory filtered_directory`` and run the filter."""
    # Without this declaration the assignment below would only bind a local
    # name and the threshold given on the command line would be ignored.
    global filter_threshold
    parser = argparse.ArgumentParser(description="Process some directories.")
    parser.add_argument("threshold", type=float, help="The filtered threshold for experiences")
    parser.add_argument("directory", type=str, help="The directory to process")
    parser.add_argument("filtered_directory", type=str, help="The directory for output")
    args = parser.parse_args()
    filter_threshold = args.threshold
    filter_valuegain(args.directory, args.filtered_directory)


if __name__ == "__main__":
    main()

176
ecl/utils.py Normal file
View File

@ -0,0 +1,176 @@
import subprocess
import json
import yaml
import time
import logging
from easydict import EasyDict
import openai
from openai import OpenAI
import numpy as np
import os
from abc import ABC, abstractmethod
import tiktoken
from typing import Any, Dict
from tenacity import (
retry,
stop_after_attempt,
wait_exponential
)
# Read once at import time; a missing OPENAI_API_KEY raises KeyError here.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
# Optional endpoint override; None when the variable is not set.
BASE_URL = os.environ.get('BASE_URL')
def getFilesFromType(sourceDir, filetype):
    """Return the paths of all files under ``sourceDir`` whose name ends with ``filetype``.

    Sub-folders are searched recursively; paths are joined onto the folder
    each file was found in.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(sourceDir)
        for name in names
        if name.endswith(filetype)
    ]
def cmd(command: str):
    """Echo ``command``, run it through the shell and return its captured stdout.

    The string is handed to the shell as-is (``shell=True``), so callers are
    responsible for what they interpolate into it.
    """
    print(">> {}".format(command))
    completed = subprocess.run(command, shell=True, text=True, stdout=subprocess.PIPE)
    return completed.stdout
def get_easyDict_from_filepath(path: str):
    """Load a ``.json`` or ``.yaml`` file into an ``EasyDict``.

    Args:
        path: file to load; the parser is chosen from the file extension.

    Returns:
        The parsed content wrapped in ``EasyDict``, or None when the
        extension is neither ``.json`` nor ``.yaml``.
    """
    if path.endswith('.json'):
        with open(path, 'r', encoding="utf-8") as file:
            config_map = json.load(file, strict=False)
        return EasyDict(config_map)
    if path.endswith('.yaml'):
        # Context manager so the handle is closed instead of being leaked.
        with open(path, 'r', encoding="utf-8") as file:
            file_data = file.read()
        # NOTE(review): FullLoader is kept as-is; only use on trusted config files.
        config_map = yaml.load(file_data, Loader=yaml.FullLoader)
        return EasyDict(config_map)
    return None
def calc_max_token(messages, model):
    """Estimate how many completion tokens remain for ``model`` after ``messages``.

    The prompt size is the token count of all message contents joined by
    newlines plus a fixed margin of 50; the result is the model's entry in
    the context-size table minus that prompt size.

    Raises:
        KeyError: ``model`` is not in the context-size table.
    """
    joined = "\n".join([message["content"] for message in messages])
    encoder = tiktoken.encoding_for_model(model)
    # 50 extra tokens are reserved on top of the encoded prompt.
    prompt_tokens = len(encoder.encode(joined)) + 50
    context_sizes = {
        "gpt-3.5-turbo": 4096,
        "gpt-3.5-turbo-16k": 16384,
        "gpt-3.5-turbo-0613": 4096,
        "gpt-3.5-turbo-16k-0613": 16384,
        "gpt-4": 8192,
        "gpt-4-0613": 8192,
        "gpt-4-32k": 32768,
    }
    return context_sizes[model] - prompt_tokens
class ModelBackend(ABC):
    r"""Abstract interface shared by all model backends.

    A backend may wrap the OpenAI API, a local LLM, a unit-test stub, etc."""

    @abstractmethod
    def run(self, *args, **kwargs) -> Dict[str, Any]:
        r"""Send the query to the backend model.

        Raises:
            RuntimeError: if the value returned by the OpenAI API is not
                the expected dict.

        Returns:
            Dict[str, Any]: every backend must return a dict in OpenAI format.
        """
class OpenAIModel(ModelBackend):
    r"""OpenAI API in a unified ModelBackend interface."""

    # Context sizes used to estimate the remaining completion budget.
    _NUM_MAX_TOKEN_MAP = {
        "gpt-3.5-turbo": 4096,
        "gpt-3.5-turbo-16k": 16384,
        "gpt-3.5-turbo-0613": 4096,
        "gpt-3.5-turbo-16k-0613": 16384,
        "gpt-4": 8192,
        "gpt-4-0613": 8192,
        "gpt-4-32k": 32768,
    }

    def __init__(self, model_type, model_config_dict: Dict = None) -> None:
        """
        Args:
            model_type: model name sent with every request; must be a key of
                the context-size table.
            model_config_dict: request parameters; the defaults below are
                used when omitted.
        """
        super().__init__()
        self.model_type = model_type
        if model_config_dict is None:
            model_config_dict = {"temperature": 0.2,
                                 "top_p": 1.0,
                                 "n": 1,
                                 "stream": False,
                                 "frequency_penalty": 0.0,
                                 "presence_penalty": 0.0,
                                 "logit_bias": {},
                                 }
        # Copied because run() records max_tokens in it; the caller's dict
        # must not be modified behind their back.
        self.model_config_dict = dict(model_config_dict)
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.total_tokens = 0

    def _estimate_max_completion_tokens(self, messages):
        """Return the model's context size minus the estimated prompt size.

        The prompt size is the token count of all message contents joined by
        newlines plus 15 tokens per message.

        Raises:
            KeyError: ``self.model_type`` is not in the context-size table.
        """
        string = "\n".join([message["content"] for message in messages])
        encoding = tiktoken.encoding_for_model(self.model_type)
        num_prompt_tokens = len(encoding.encode(string)) + 15 * len(messages)
        return self._NUM_MAX_TOKEN_MAP[self.model_type] - num_prompt_tokens

    @retry(wait=wait_exponential(min=5, max=60), stop=stop_after_attempt(5))
    def run(self, messages):
        """Send ``messages`` as one chat completion request.

        Returns:
            The response as a dict in OpenAI format; its token usage is added
            to the running counters on this instance.

        Raises:
            RuntimeError: the API response could not be turned into a dict.
        """
        # Done before the request so that an unsupported model type fails
        # without a completion having been requested.
        num_max_completion_tokens = self._estimate_max_completion_tokens(messages)

        client_kwargs = {"api_key": OPENAI_API_KEY}
        if BASE_URL:
            client_kwargs["base_url"] = BASE_URL
        client = openai.OpenAI(**client_kwargs)

        # The configured model and parameters are sent instead of hard-coded
        # copies of the defaults. max_tokens is only recorded locally, as
        # before, and never sent.
        request_config = {key: value for key, value in self.model_config_dict.items()
                          if key != "max_tokens"}
        response = client.chat.completions.create(messages=messages,
                                                  model=self.model_type,
                                                  **request_config).model_dump()
        if not isinstance(response, dict):
            raise RuntimeError("Unexpected return from OpenAI API")
        self.model_config_dict['max_tokens'] = num_max_completion_tokens

        usage = response["usage"]
        log_and_print_online(
            "InstructionStar generation:\n**[OpenAI_Usage_Info Receive]**\nprompt_tokens: {}\ncompletion_tokens: {}\ntotal_tokens: {}\n".format(
                usage["prompt_tokens"], usage["completion_tokens"],
                usage["total_tokens"]))
        self.prompt_tokens += usage["prompt_tokens"]
        self.completion_tokens += usage["completion_tokens"]
        self.total_tokens += usage["total_tokens"]
        return response
def now():
    """Return the current local time as a ``YYYYmmddHHMMSS`` string."""
    # With no time tuple given, strftime formats the current local time.
    return time.strftime("%Y%m%d%H%M%S")
def log_and_print_online(content=None):
    """Print ``content`` and record it at INFO level; do nothing for None."""
    if content is None:
        return
    print(content)
    logging.info(content)

66
wiki.md
View File

@ -131,6 +131,62 @@ then start building a software by ``python3 run.py`` and go to [Visualizer Websi
### Official Docker Image
- in preparation
## Experiential Co-Learning Guide
### Co-Tracking
- **Start Co-Tracking**: Use the following command to initiate the building of software, replacing `[description_of_your_idea]` with the task description and `[project_name]` with the project name. This is the same as starting ChatDev.
```bash
python3 run.py --task "[description_of_your_idea]" --name "[project_name]"
```
The software generated in co-tracking phase is ready for the agents' experience pool in the following steps.
### Co-Memorizing
- **Initiating Co-Memorizing**: To begin the memorization process for the generated software in a specified directory, run the `ecl.py` script using the following command:
```bash
python3 ecl/ecl.py "<path>" "[options]"
```
`<path>`: The path to the file or directory to process.
`[options]`: This can be set as `-d`. This flag indicates that the script should process all files in the given directory. If this flag is not set, the script will process the file specified in path.
After this process, the experiences have been extracted from the production of software and added to the agents' experience pool in `ecl/memory/MemoryCards.json`.
\
**For example:**
If you want to memorize only one software project, you can use:
```bash
python3 ecl/ecl.py "<Software Path to file>"
```
And the software path should be like `"WareHouse/project_name_DefaultOrganization_timestamp"`.
\
If you want to memorize all files in a directory, you can use:
```bash
python3 ecl/ecl.py "<Software Path to Directory>" -d
```
the software path should be like `"WareHouse"`.
- **Memory Filter**: To get a higher quality experience pool, it is suggested to use `ecl/post_process/memory_filter.py` to filter the `MemoryCards.json`. When running the `memory_filter.py` script, you need to specify three arguments: the filter threshold, the input directory, and the output directory.
```bash
python3 ecl/post_process/memory_filter.py "<threshold>" "<directory>" "<filtered_directory>"
```
- `<threshold>`: Requires a value within the range of 0 to 1 (exclusive). It is used as the threshold to filter experiences by their `valueGain`. Only experiences with a `valueGain` equal to or greater than this threshold are kept.
- `<directory>`: The path to the memory JSON file that you intend to process (e.g. `ecl/memory/MemoryCards.json`).
- `<filtered_directory>`: The path of the JSON file in which the filtered memory will be stored (e.g. `ecl/memory/MemoryCards_filtered.json`).
\
**For example:**
```bash
python3 ecl/post_process/memory_filter.py 0.9 "ecl/memory/MemoryCards.json" "ecl/memory/MemoryCards_filtered.json"
```
> **Notice:** By default, the `MemoryCards.json` is set to be empty. You can customize your own experience pool for agents following steps above. And we have also provided our `MemoryCards.json` used in our experiment in [MemoryCards.json](https://drive.google.com/drive/folders/1czsR4swQyqpoN8zwN0-rSFcTVl68zTDY?usp=sharing). You can download the json file through the link and put it under `ecl/memory` folder. This allows you to directly proceed to the Co-Reasoning phase without needing to redo the Co-Tracking and Co-Memorizing steps.
### Co-Reasoning
- **Memory Usage Configuration**:
In the `CompanyConfig/Default/ChatChainConfig.json` file, the `with_memory` option should be set to **True**. \
In the `ecl/config.yaml` file, you can adjust the settings for **top k** and **similarity threshold** for both code and text retrieval.
By default, `with_memory` is set as False and the system is configured to retrieve the top 1 result with a similarity threshold of zero for both code and text.
- **Start Co-Reasoning**: Once you have completed memory usage configuration, similar to the Co-Tracking phase, you can use the command below to start the software building process. Replace `[description_of_your_idea]` with the task description from the test set and `[project_name]` with the project name from the test set:
```
python3 run.py --task "[description_of_your_idea]" --name "[project_name]"
```
During this run, the agents will draw on their experience pool (`MemoryCards.json`) while developing the software!
Detailed descriptions and experiment results for the **Experiential Co-Learning** Module can be found in our preprint paper at https://arxiv.org/abs/2312.17025.
## Customization
- You can customize your company in three kinds of granularity:
@ -278,6 +334,7 @@ then start building a software by ``python3 run.py`` and go to [Visualizer Websi
- *self_improve*: flag for self-improvement on user input prompt. It is a special chat that LLM plays as a prompt engineer to improve the user input prompt. **⚠️ Attention** Model generated prompts contain uncertainty and there may
be a deviation from the requirement meaning contained in the original prompt.
- *background_prompt*: background prompt that will be added to every inquiry to LLM
- *with_memory*: Whether to utilize the experience pool for agents. The experience pool is stored in `ecl/memory/MemoryCards.json`.
- params in SimplePhase:
- *max_turn_step*: Max number of chatting turn. You can increase max_turn_step for better performance but it will
take a longer time to finish the phase.
@ -290,10 +347,11 @@ then start building a software by ``python3 run.py`` and go to [Visualizer Websi
```commandline
├── CompanyConfig # Configuration Files for ChatDev, including ChatChain, Phase and Role config json.
├── WareHouse # Folder for generated software
├── camel # Camel RolePlay component
├── chatdev # ChatDev core code
├── misc # assets of example and demo
├── WareHouse # Folder for Generated Software
├── camel # Camel RolePlay Component
├── chatdev # ChatDev Core Code
├── ecl # Experiential Co-Learning Module
├── misc # Assets of Example and Demo
├── visualizer # Visualizer Folder
├── run.py # Entry of ChatDev
├── requirements.txt