ChatDev/ecl/experience.py
2024-01-25 10:10:15 +08:00

312 lines
14 KiB
Python

import os
import time
from graph import Graph, Node, Edge
import sys
import openai
import numpy as np
from codes import Codes
from utils import get_easyDict_from_filepath,OpenAIModel,log_and_print_online
from embedding import OpenAIEmbedding
sys.path.append(os.path.join(os.getcwd(),"ecl"))
class Shortcut:
    """Record describing a jump from a source graph node to a target node.

    Instances are plain attribute bags: the constructor stores its arguments
    unchanged and initialises ``embedding`` to ``None``.  Within this file,
    ``Experience.extract_thresholded_experiences`` builds them with
    ``valueGain`` set to the target node's value minus the source node's
    value and ``edgeIDPath`` set to the edge ids of the path between them.
    """

    def __init__(self, sourceMID, targetMID, valueGain,instructionStar,edgeIDPath):
        """Store the endpoints, the value gain, the instruction text and the edge path."""
        # Assignment order is kept on purpose: it determines the key order of
        # ``__dict__``, which ``Experience.to_dict`` hands out as-is.
        self.sourceMID, self.targetMID = sourceMID, targetMID
        self.valueGain = valueGain
        self.embedding = None
        self.instructionStar = instructionStar
        self.edgeIDPath = edgeIDPath

    def __str__(self):
        """Return a one-line summary with at most 100 characters of instruction text."""
        path_length = len(self.edgeIDPath)
        # Newlines are dropped from the preview so the summary stays on one line.
        instruction_preview = self.instructionStar[:100].replace("\n", "")
        summary = "{} -> {} valueGain={:.6f} len(instructionPath)={} instructionStar={}"
        return summary.format(
            self.sourceMID,
            self.targetMID,
            self.valueGain,
            path_length,
            instruction_preview,
        )
class Experience:
    """Score the nodes of a solution graph and extract "shortcut" experiences.

    A shortcut links two nodes on the graph's shortest path that are connected
    transitively but not by a direct edge, and whose value difference reaches
    the configured threshold.  For every retained shortcut an instruction text
    is requested from the chat model.
    """

    # Prompt used when the source node has no code: describe the target only.
    _PROMPT_FROM_SCRATCH = """Provide detailed instructions to generate the following code:
{targetcode}
The instructions should encompass:
Modules and Classes:
- Enumerate necessary modules.
- Detail the classes, their attributes, and methods within these modules.
- Articulate the purpose and operation of each class.
Data Structures:
- Identify the requisite data structures.
- Describe their names, attributes, and operations.
Main Program Flow:
- Outline the principal progression of the program.
- Highlight the sequence for initializing and invoking other modules, classes, and methods within the primary file (e.g., main.py).
- Clarify the logical progression during runtime.
Input and Output:
- Specify the method by which the program accepts input, be it from users or external sources.
- Elaborate on the projected outputs or actions of the software.
Exception Handling:
- Instruct on the approach to manage potential anomalies or exceptions during execution to ascertain stability and robustness.
External Libraries and Dependencies:
- Explicitly list the necessary external libraries or dependencies, their versions, and their functionalities.
Please output the instructions directly."""

    # Prompt used when both source and target code exist: describe the transition.
    _PROMPT_TRANSITION = """Please provide detailed instructions on how to transition from the initial code version represented by source code to the final version indicated by target code.
Source Code:
{sourcecode}
Target Code:
{targetcode}
The instructions should encompass:
Modules and Classes: Detail the modules to be incorporated, along with the names, attributes, and operations of any classes to be added or amended. Furthermore, describe the intended function and utility of these new or altered classes.
Data Structures: Clearly define any data structures that need introduction or alteration, elucidating their names, attributes, and functionalities.
Main Program Flow: Outline the program's primary sequence of operations, highlighting the procedures to initialize and invoke other modules, classes, and methods in the primary file (e.g., main.py). Describe the program's logic sequence during its execution.
Input and Output: Define the methodology by which the program will acquire input, whether from users or external data sources. Also, characterize the projected outputs or behaviors of the application.
Exception Handling: Provide guidance on managing potential discrepancies or exceptions that might emerge during the software's operation, ensuring its resilience and reliability.
External Libraries and Dependencies: If the implementation requires external libraries or dependencies, specify their names, versions, and their respective purposes explicitly."""

    def __init__(self, graph: "Graph", directory: str):
        """Bind the graph, load thresholds from ``./ecl/config.yaml`` and reset node state.

        Args:
            graph: graph whose ``nodes`` mapping and ``edges`` list are scored.
                Node ``degree`` is incremented and node ``value`` overwritten here.
            directory: directory that is searched for a ``*.prompt`` file when
                the task embedding has to be computed.

        Raises:
            AssertionError: if the summed node degrees differ from the edge count.
        """
        cfg = get_easyDict_from_filepath("./ecl/config.yaml")
        self.graph: "Graph" = graph
        self.directory = directory
        self.threshold = cfg.experience.threshold
        self.upperLimit = cfg.experience.upper_limit
        self.experiences = []
        self.model = OpenAIModel(model_type="gpt-3.5-turbo-16k")
        self.embedding_method = OpenAIEmbedding()

        # Every edge adds one to the degree of the node it points at.
        for edge in self.graph.edges:
            self.graph.nodes[edge.targetMID].degree += 1
        total_degree = sum(self.graph.nodes[mid].degree for mid in self.graph.nodes)
        assert len(self.graph.edges) == total_degree  # unidirectional

        # All nodes start from the same value before estimation.
        for mid in self.graph.nodes:
            self.graph.nodes[mid].value = 1.0

    def reap_zombie(self):
        """Remove every edge and node that is not on the graph's shortest path.

        The removals are applied to ``self.graph`` in place and reported through
        ``log_and_print_online`` (one message for edges, one for nodes).
        """
        pathNodes, pathEdges = self.graph.find_shortest_path()
        # Collect first, delete afterwards, so the containers are not mutated
        # while they are being scanned.
        zombieEdges = [edge for edge in self.graph.edges if edge not in pathEdges]
        zombieNodes = [self.graph.nodes[mid] for mid in self.graph.nodes if mid not in pathNodes]

        log_zombieedges = "ZOMBIE EDGES: \n"
        for edge in zombieEdges:
            self.graph.edges.remove(edge)
            log_zombieedges += "Zombie Edge {} -> {} Removed\n".format(edge.sourceMID, edge.targetMID)
        log_and_print_online(log_zombieedges)

        log_zombienodes = "ZOMBIE NODES: \n"
        for node in zombieNodes:
            del self.graph.nodes[node.mID]
            log_zombienodes += "Zombie Node {} Removed\n".format(node.mID)
        log_and_print_online(log_zombienodes)

    def estimate(self):
        """Assign a value to every node relative to the target of the last edge.

        Nodes with empty code are zeroed first.  Each node's value is then
        replaced by ``_pairwise_estimate(node, final_node)``.  Does nothing
        when the graph has no edges.
        """
        if len(self.graph.edges) == 0:
            return

        nodes = self.graph.nodes
        for mid in nodes:
            if len(nodes[mid].code) == 0:
                nodes[mid].value *= 0.0

        # NOTE(review): argument-less call kept from the original; its effect
        # depends on log_and_print_online's default, which is not visible here.
        log_and_print_online()

        # Capture the starting values *before* they are overwritten; building
        # this dict after the loop would log the estimated values twice.
        init_values = {mid: nodes[mid].value for mid in nodes}

        vn = nodes[self.graph.edges[-1].targetMID]
        for mid in nodes:
            vi = nodes[mid]
            vi.value = self._pairwise_estimate(vi, vn)

        estimated_values = {mid: nodes[mid].value for mid in nodes}
        log_and_print_online("Init value:" + str(init_values) + "\n\nEstimated value:" + str(estimated_values))

    def get_cosine_similarity(self, embeddingi, embeddingj):
        """Return the cosine similarity of two embedding vectors.

        Args:
            embeddingi: first vector (anything ``numpy.array`` accepts).
            embeddingj: second vector, same length as the first.

        Returns:
            The dot product divided by the product of the Euclidean norms, or
            ``0.0`` when either vector has zero norm (the plain division would
            yield ``nan``, which callers comparing against ``0.0`` would miss).
        """
        vec_i = np.array(embeddingi)
        vec_j = np.array(embeddingj)
        norm_product = np.linalg.norm(vec_i) * np.linalg.norm(vec_j)
        if norm_product == 0.0:
            return 0.0
        return vec_i.dot(vec_j) / norm_product

    def _node_embedding(self, node):
        """Return ``node.embedding``, computing and caching it from ``node.code`` if unset."""
        if node.embedding is None:
            start_time = time.time()
            embedding = self.embedding_method.get_code_embedding(node.code)
            end_time = time.time()
            log_and_print_online("DONE:get node embedding\ntime cost:{}\n".format(end_time-start_time))
            node.embedding = embedding
        return node.embedding

    def _task_embedding(self):
        """Return the graph's task embedding, computing it from the ``.prompt`` file if unset."""
        if self.graph.task_embedding is None:
            # The first *.prompt file found in the directory holds the task text;
            # an IndexError is raised if there is none.
            prompt_files = [name for name in os.listdir(self.directory) if name.endswith(".prompt")]
            with open(os.path.join(self.directory, prompt_files[0]), "r") as handle:
                task_prompt = handle.read().strip()
            start_time = time.time()
            task_emb = self.embedding_method.get_text_embedding(task_prompt)
            end_time = time.time()
            log_and_print_online("DONE:get task prompt embedding\ntime cost:{}\n".format(end_time-start_time))
            self.graph.task = task_prompt
            self.graph.task_embedding = task_emb
        return self.graph.task_embedding

    def _pairwise_estimate(self, vi: "Node", vj: "Node"):
        """Estimate the value of node ``vi`` relative to node ``vj``.

        The result is ``0.0`` as soon as one factor is zero: ``vi`` already has
        value 0, its code reports bugs when run, its degree is 0, or one of the
        cosine similarities is 0.  When both nodes share the same ``version``
        the result is ``1``.  Otherwise it is the product of code-to-code
        similarity, code-to-task similarity, compile weight and degree weight,
        divided by the version distance.

        Side effects: rewrites and runs ``vi``'s code via ``Codes``, and caches
        embeddings on the nodes and on the graph.
        """
        if vi.value == 0.0:
            return 0.0

        # The path-based weight is only checked by the assertion below; it is
        # not part of the returned product.
        _, pathEdges = self.graph.find_shortest_path(vi.mID, vj.mID)
        distance_weight = 1.0 / len(pathEdges) if len(pathEdges) != 0 else 1.0

        codes = Codes(vi.code)
        codes._rewrite_codes()
        exist_bugs_flag, _ = codes._run_codes()
        compile_weight = 0.0 if exist_bugs_flag else 1.0
        if compile_weight == 0.0:
            return 0.0

        maximum_degree = max(self.graph.nodes[mid].degree for mid in self.graph.nodes)
        degree_weight = vi.degree * 1.0 / maximum_degree
        if degree_weight == 0.0:
            return 0.0

        vi_code_emb = self._node_embedding(vi)
        vj_code_emb = self._node_embedding(vj)
        code_code_cos_sim = self.get_cosine_similarity(vi_code_emb, vj_code_emb)
        if code_code_cos_sim == 0.0:
            return 0.0

        task_emb = self._task_embedding()
        code_text_cos_sim = self.get_cosine_similarity(vi_code_emb, task_emb)
        if code_text_cos_sim == 0.0:
            return 0.0

        assert distance_weight >= 0.0 and distance_weight <= 1.0
        assert compile_weight >= 0.0 and compile_weight <= 1.0
        assert degree_weight >= 0.0 and degree_weight <= 1.0

        distance = vj.version - vi.version
        if distance == 0:
            return 1
        return code_code_cos_sim * 1.0 / distance * code_text_cos_sim * compile_weight * degree_weight

    def get_transitive_closure(self):
        """Return the reachability matrix of the shortest path's edges.

        Returns:
            A dict of dicts keyed by node id on both levels; ``matrix[a][b]`` is
            ``1`` when ``b`` can be reached from ``a`` through shortest-path
            edges and ``0`` otherwise.  The initial adjacency matrix and the
            closure are printed to stdout.
        """
        def print_matrix(matrix):
            for nodei in matrix.keys():
                for nodej in matrix.keys():
                    print(matrix[nodei][nodej], end=" ")
                print()
            print()

        # Warshall Algorithm
        node_ids = list(self.graph.nodes)
        matrix = {mid1: {mid2: 0 for mid2 in node_ids} for mid1 in node_ids}

        _, pathEdges = self.graph.find_shortest_path()
        for edge in pathEdges:
            matrix[edge.sourceMID][edge.targetMID] = 1
        print("Init Adjacent Matrix:")
        print_matrix(matrix)

        # k must be the outermost loop: it is the intermediate node being allowed.
        for nodek in node_ids:
            for nodei in node_ids:
                for nodej in node_ids:
                    if matrix[nodei][nodek] == 1 and matrix[nodek][nodej] == 1:
                        matrix[nodei][nodej] = 1
        print("Transitive Closure:")
        print_matrix(matrix)
        return matrix

    @staticmethod
    def _has_placeholder(code):
        """Return True if any stripped, lower-cased line of ``code`` is exactly ``pass`` or ``todo``."""
        code_lines = [line.lower().strip() for line in code.split("\n")]
        return "pass" in code_lines or "todo" in code_lines

    def _generate_instruction(self, shortcut):
        """Ask the chat model for the instruction text of ``shortcut`` and return it."""
        sourcecode = self.graph.nodes[shortcut.sourceMID].code
        targetcode = self.graph.nodes[shortcut.targetMID].code
        if sourcecode == "":
            prompt = self._PROMPT_FROM_SCRATCH.replace("{targetcode}", targetcode)
        else:
            prompt = self._PROMPT_TRANSITION.replace("{sourcecode}", sourcecode).replace("{targetcode}", targetcode)
        response = self.model.run(messages=[{"role": "system", "content": prompt}])
        print("instructionstar generated")
        return response["choices"][0]["message"]["content"]

    def extract_thresholded_experiences(self):
        """Build, rank, truncate and annotate the shortcut experiences.

        A pair ``(id1, id2)`` of shortest-path nodes becomes a shortcut when the
        nodes differ, have no direct edge, ``id2`` is reachable from ``id1``,
        the value gain reaches ``self.threshold`` and the target code has no
        bare ``pass``/``TODO`` line.  Shortcuts are sorted by descending value
        gain, cut to ``self.upperLimit`` entries and given an instruction text
        from the chat model.

        Returns:
            The list of ``Shortcut`` objects (also stored in
            ``self.experiences``); an empty list when the graph has no edges or
            fewer than two nodes.
        """
        if len(self.graph.edges) == 0:
            return []
        if len(self.graph.nodes) < 2:
            return []

        matrix = self.get_transitive_closure()
        nodes = self.graph.nodes

        experiences = []
        pathNodes, _ = self.graph.find_shortest_path()
        for id1 in pathNodes:
            for id2 in pathNodes:
                if id1 == id2:
                    continue
                if self.graph.exists_edge(id1, id2):
                    continue
                if matrix[id1][id2] != 1:
                    continue
                valueGain = nodes[id2].value - nodes[id1].value
                if not valueGain >= self.threshold:
                    continue
                if self._has_placeholder(nodes[id2].code):
                    continue
                _, edges = self.graph.find_shortest_path(uMID=id1, vMID=id2)
                edgeIDPath = [edge.edgeId for edge in edges]
                experiences.append(
                    Shortcut(sourceMID=id1, targetMID=id2, valueGain=valueGain, instructionStar="", edgeIDPath=edgeIDPath)
                )

        experiences = sorted(experiences, key=lambda item: item.valueGain, reverse=True)

        if len(experiences) > self.upperLimit:
            log_and_print_online("{} experieces truncated.".format(len(experiences) - self.upperLimit))
            experiences = experiences[:self.upperLimit]

        for shortcut in experiences:
            shortcut.instructionStar = self._generate_instruction(shortcut)

        self.experiences = experiences
        output = "Sorted-and-Truncated Experiences (with instructionStar):" + "".join(
            str(experience) for experience in experiences
        )
        log_and_print_online(output)
        log_and_print_online("[Conclusion]:\nprompt_tokens:{}, completion_tokens:{}, total_tokens:{}".format(self.model.prompt_tokens,self.model.completion_tokens,self.model.total_tokens))
        log_and_print_online("[Conclusion]:\ntext_prompt_tokens:{}, text_total_tokens:{}\ncode_prompt_tokens:{}, code_total_tokens:{}\nprompt_tokens:{}, total_tokens:{}".format(self.embedding_method.text_prompt_tokens,
                                                                                                                                                                             self.embedding_method.text_total_tokens,
                                                                                                                                                                             self.embedding_method.code_prompt_tokens,
                                                                                                                                                                             self.embedding_method.code_total_tokens,
                                                                                                                                                                             self.embedding_method.prompt_tokens,
                                                                                                                                                                             self.embedding_method.total_tokens))

        return experiences

    def to_dict(self):
        """Return the attribute dict (``__dict__``) of every stored experience, in order.

        The dicts are the live attribute mappings of the objects, not copies.
        """
        return [ex.__dict__ for ex in self.experiences]