├── 2024 ├── practice.zip └── practice │ ├── fall_in_line.cpp │ ├── fall_in_line.in │ ├── fall_in_line.md │ ├── fall_in_line.out │ ├── fall_in_line.png │ ├── fall_in_line_sample_input.txt │ ├── fall_in_line_sample_output.txt │ ├── line_by_line.cpp │ ├── line_by_line.in │ ├── line_by_line.md │ ├── line_by_line.out │ ├── line_by_line_sample_input.txt │ ├── line_by_line_sample_output.txt │ ├── line_of_delivery_part_1.cpp │ ├── line_of_delivery_part_1.in │ ├── line_of_delivery_part_1.md │ ├── line_of_delivery_part_1.out │ ├── line_of_delivery_part_1.png │ ├── line_of_delivery_part_1_sample_input.txt │ ├── line_of_delivery_part_1_sample_output.txt │ ├── line_of_delivery_part_2.cpp │ ├── line_of_delivery_part_2.in │ ├── line_of_delivery_part_2.md │ ├── line_of_delivery_part_2.out │ ├── line_of_delivery_part_2.png │ ├── line_of_delivery_part_2_sample_input.txt │ ├── line_of_delivery_part_2_sample_output.txt │ ├── walk_the_line.cpp │ ├── walk_the_line.in │ ├── walk_the_line.md │ ├── walk_the_line.out │ ├── walk_the_line_sample_input.txt │ └── walk_the_line_sample_output.txt ├── .gitignore ├── README.md ├── __init__.py ├── agent.py ├── download.py ├── llamaindex_workflow.ipynb ├── llamaindex_workflow.py ├── mistral.py ├── one_shot.py ├── one_shot_o1.py ├── one_shot_solver.ipynb ├── rag_code_agent.ipynb ├── requirements.txt ├── retriever.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | dataset/ 2 | -------------------------------------------------------------------------------- /2024/practice.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wandb/aihackercup/0ad5cde4d5bb64572bdb8373296e87392995e5f7/2024/practice.zip -------------------------------------------------------------------------------- /2024/practice/fall_in_line.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
#include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0200r0.html 21 | template class y_combinator_result { 22 | Fun fun_; 23 | public: 24 | template explicit y_combinator_result(T &&fun): fun_(std::forward(fun)) {} 25 | template decltype(auto) operator()(Args &&...args) { return fun_(std::ref(*this), std::forward(args)...); } 26 | }; 27 | template decltype(auto) y_combinator(Fun &&fun) { return y_combinator_result>(std::forward(fun)); } 28 | 29 | 30 | template ostream& operator<<(ostream &os, const pair &p) { return os << '(' << p.first << ", " << p.second << ')'; } 31 | template ostream& operator<<(ostream& os, const tuple& t) { os << '('; apply([&os](const Args&... args) { size_t n = 0; ((os << args << (++n != sizeof...(Args) ? ", " : "")), ...); }, t); return os << ')'; } 32 | template::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ", "; return os << '}'; } 33 | 34 | void dbg_out() { cerr << endl; } 35 | template void dbg_out(Head H, Tail... T) { cerr << ' ' << H; dbg_out(T...); } 36 | #ifdef NEAL_DEBUG 37 | #define dbg(...) cerr << '[' << __FILE__ << ':' << __LINE__ << "] (" << #__VA_ARGS__ << "):", dbg_out(__VA_ARGS__) 38 | #else 39 | #define dbg(...) 40 | #endif 41 | 42 | // TODO: set this to false if it's unnecessary and the time limit might be tight. 43 | // CHECK_OVERFLOW64 = true can run up to 2 times slower (particularly on CF). 
44 | const bool CHECK_OVERFLOW64 = true; 45 | 46 | using dist_t = long double; 47 | 48 | struct point { 49 | int64_t x, y; 50 | 51 | point() : x(0), y(0) {} 52 | 53 | point(int64_t _x, int64_t _y) : x(_x), y(_y) {} 54 | 55 | point& operator+=(const point &other) { x += other.x; y += other.y; return *this; } 56 | point& operator-=(const point &other) { x -= other.x; y -= other.y; return *this; } 57 | point& operator*=(int64_t mult) { x *= mult; y *= mult; return *this; } 58 | 59 | point operator+(const point &other) const { return point(*this) += other; } 60 | point operator-(const point &other) const { return point(*this) -= other; } 61 | point operator*(int64_t mult) const { return point(*this) *= mult; } 62 | 63 | bool operator==(const point &other) const { return x == other.x && y == other.y; } 64 | bool operator!=(const point &other) const { return !(*this == other); } 65 | 66 | point operator-() const { return point(-x, -y); } 67 | point rotate90() const { return point(-y, x); } 68 | 69 | int64_t norm() const { 70 | return (int64_t) x * x + (int64_t) y * y; 71 | } 72 | 73 | dist_t dist() const { 74 | return sqrt(dist_t(norm())); 75 | } 76 | 77 | bool top_half() const { 78 | return y > 0 || (y == 0 && x > 0); 79 | } 80 | 81 | friend ostream& operator<<(ostream &os, const point &p) { 82 | return os << '(' << p.x << ", " << p.y << ')'; 83 | } 84 | }; 85 | 86 | int64_t cross(const point &a, const point &b) { 87 | return (int64_t) a.x * b.y - (int64_t) b.x * a.y; 88 | } 89 | 90 | int64_t dot(const point &a, const point &b) { 91 | return (int64_t) a.x * b.x + (int64_t) a.y * b.y; 92 | } 93 | 94 | int cross_sign(const point &a, const point &b) { 95 | if (CHECK_OVERFLOW64) { 96 | long double double_value = (long double) a.x * b.y - (long double) b.x * a.y; 97 | 98 | if (abs(double_value) > 1e18) 99 | return (double_value > 0) - (double_value < 0); 100 | } 101 | 102 | uint64_t uint64_value = (uint64_t) a.x * b.y - (uint64_t) b.x * a.y; 103 | int64_t actual = 
int64_t(uint64_value); 104 | return (actual > 0) - (actual < 0); 105 | } 106 | 107 | bool left_turn_strict(const point &a, const point &b, const point &c) { 108 | return cross_sign(b - a, c - a) > 0; 109 | } 110 | 111 | bool left_turn_lenient(const point &a, const point &b, const point &c) { 112 | return cross_sign(b - a, c - a) >= 0; 113 | } 114 | 115 | bool collinear(const point &a, const point &b, const point &c) { 116 | return cross_sign(b - a, c - a) == 0; 117 | } 118 | 119 | // Returns twice the signed area formed by three points in a triangle. Positive when a -> b -> c is a left turn. 120 | int64_t area_signed_2x(const point &a, const point &b, const point &c) { 121 | return cross(b - a, c - a); 122 | } 123 | 124 | dist_t distance_to_line(const point &p, const point &a, const point &b) { 125 | assert(a != b); 126 | return dist_t(abs(area_signed_2x(p, a, b))) / (a - b).dist(); 127 | } 128 | 129 | int64_t manhattan_dist(const point &a, const point &b) { 130 | return (int64_t) abs(a.x - b.x) + abs(a.y - b.y); 131 | } 132 | 133 | int64_t infinity_norm_dist(const point &a, const point &b) { 134 | return max(abs(a.x - b.x), abs(a.y - b.y)); 135 | } 136 | 137 | // Sort in increasing order of y, with ties broken in increasing order of x. 138 | bool yx_compare(const point &a, const point &b) { 139 | return make_pair(a.y, a.x) < make_pair(b.y, b.x); 140 | } 141 | 142 | // Sort in increasing order of angle to the x-axis. 
143 | bool angle_compare(const point &a, const point &b) { 144 | if (a.top_half() ^ b.top_half()) 145 | return a.top_half(); 146 | 147 | return cross_sign(a, b) > 0; 148 | } 149 | 150 | 151 | const int ITERS = 100; 152 | 153 | uint64_t random_address() { char *p = new char; delete p; return uint64_t(p); } 154 | 155 | const uint64_t SEED = chrono::steady_clock::now().time_since_epoch().count() * (random_address() | 1); 156 | mt19937_64 rng(SEED); 157 | 158 | void run_case(int test_case) { 159 | int N; 160 | cin >> N; 161 | vector ants(N); 162 | 163 | for (auto &ant : ants) 164 | cin >> ant.x >> ant.y; 165 | 166 | int most = 0; 167 | 168 | for (int iter = 0; iter < ITERS; iter++) { 169 | int a, b; 170 | 171 | do { 172 | a = int(rng() % N); 173 | b = int(rng() % N); 174 | } while (a == b); 175 | 176 | int line = 2; 177 | 178 | for (int i = 0; i < N; i++) 179 | if (i != a && i != b && collinear(ants[a], ants[b], ants[i])) 180 | line++; 181 | 182 | most = max(most, line); 183 | } 184 | 185 | cout << "Case #" << test_case << ": " << N - most << '\n'; 186 | } 187 | 188 | int main() { 189 | int tests; 190 | cin >> tests; 191 | 192 | for (int tc = 1; tc <= tests; tc++) { 193 | run_case(tc); 194 | cout << flush; 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /2024/practice/fall_in_line.md: -------------------------------------------------------------------------------- 1 | # Problem C: Fall in Line 2 | 3 | As the queen of an ant colony, it’s your job to ensure that the entire colony works together. Your colony has 4 | N worker ants, the ith of which is currently at coordinates (Xi, Yi). To align the efforts of all of your worker ants, you would like them to all be on the same line on the plane. How many of your ants need to move to get them to all lie on the same line? 5 | 6 | As is frequently the case in management, you don’t need an exact answer, but you do need some degree of accuracy. 
If the true minimum number of ants that need to move is M, then any answer between M and 2*M (inclusive) will be accepted. 7 | 8 | ## Constraints 9 | 10 | 1 ≤ T ≤ 75 11 | 2 ≤ N ≤ 1,000,000 12 | 0 ≤ ∣Xi∣, ∣Yi∣ ≤ 1,000,000,000 13 | 14 | In each test case, no two ants will be at the same position. 15 | The sum of N across all test cases is at most 4,000,000. 16 | 17 | ## Input Format 18 | 19 | Input begins with an integer T, the number of test cases. Each case starts with a line that contains the integer N. Then N lines follow, the ith of which contains the integers Xi and Yi. 20 | 21 | ## Output Format 22 | 23 | For the i-th test case, print "Case #i: " followed by the number of ants you need to move to get all of the ants to lie on the same line. 24 | 25 | ## Sample Explanation 26 | 27 | In the first case, the 4 ants are all on the line y = x, so no ants need to be moved. 0 is the only answer that will be accepted for this case. 28 | 29 | In the second case, the 4 ants are at the vertices of a square, so every line contains at most 2 of the 4 ants. 2 ants need to be moved, so the answers 2, 3, and 4 will be accepted for this case. 30 | 31 | The third case is depicted below. Ants 2, 4, 5, and 7 all lie on the line y = 3/2 x + 1. Moving the other 3 ants is the optimal way to get all of the ants on a single line, so any answer between 3 and 6 inclusive will be accepted for this case. 
32 | 33 | ![Fall in Line](fall_in_line.png) -------------------------------------------------------------------------------- /2024/practice/fall_in_line.out: -------------------------------------------------------------------------------- 1 | Case #1: 0 2 | Case #2: 2 3 | Case #3: 3 4 | Case #4: 7984 5 | Case #5: 1978 6 | Case #6: 6076 7 | Case #7: 4767 8 | Case #8: 3927 9 | Case #9: 3 10 | Case #10: 3 11 | Case #11: 7 12 | Case #12: 15 13 | Case #13: 8 14 | -------------------------------------------------------------------------------- /2024/practice/fall_in_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wandb/aihackercup/0ad5cde4d5bb64572bdb8373296e87392995e5f7/2024/practice/fall_in_line.png -------------------------------------------------------------------------------- /2024/practice/fall_in_line_sample_input.txt: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | 1 1 4 | 2 2 5 | -3 -3 6 | 4 4 7 | 4 8 | 1 1 9 | -1 1 10 | 1 -1 11 | -1 -1 12 | 7 13 | 4 8 14 | 2 4 15 | 7 2 16 | 6 10 17 | 0 1 18 | 3 4 19 | 4 7 20 | -------------------------------------------------------------------------------- /2024/practice/fall_in_line_sample_output.txt: -------------------------------------------------------------------------------- 1 | Case #1: 0 2 | Case #2: 2 3 | Case #3: 3 4 | -------------------------------------------------------------------------------- /2024/practice/line_by_line.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0200r0.html 21 | template class y_combinator_result { 22 | Fun 
fun_; 23 | public: 24 | template explicit y_combinator_result(T &&fun): fun_(std::forward(fun)) {} 25 | template decltype(auto) operator()(Args &&...args) { return fun_(std::ref(*this), std::forward(args)...); } 26 | }; 27 | template decltype(auto) y_combinator(Fun &&fun) { return y_combinator_result>(std::forward(fun)); } 28 | 29 | 30 | template ostream& operator<<(ostream &os, const pair &p) { return os << '(' << p.first << ", " << p.second << ')'; } 31 | template ostream& operator<<(ostream& os, const tuple& t) { os << '('; apply([&os](const Args&... args) { size_t n = 0; ((os << args << (++n != sizeof...(Args) ? ", " : "")), ...); }, t); return os << ')'; } 32 | template::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ", "; return os << '}'; } 33 | 34 | void dbg_out() { cerr << endl; } 35 | template void dbg_out(Head H, Tail... T) { cerr << ' ' << H; dbg_out(T...); } 36 | #ifdef NEAL_DEBUG 37 | #define dbg(...) cerr << '[' << __FILE__ << ':' << __LINE__ << "] (" << #__VA_ARGS__ << "):", dbg_out(__VA_ARGS__) 38 | #else 39 | #define dbg(...) 
40 | #endif 41 | 42 | 43 | void run_case(int test_case) { 44 | cout << fixed << setprecision(15); 45 | int N, _P; 46 | cin >> N >> _P; 47 | double P = 0.01 * _P; 48 | 49 | // P^(N - 1) = Q^N 50 | double Q = pow(P, 1 - 1.0 / N); 51 | 52 | cout << "Case #" << test_case << ": " << 100 * (Q - P) << '\n'; 53 | } 54 | 55 | int main() { 56 | int tests; 57 | cin >> tests; 58 | 59 | for (int tc = 1; tc <= tests; tc++) { 60 | run_case(tc); 61 | cout << flush; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /2024/practice/line_by_line.in: -------------------------------------------------------------------------------- 1 | 24 2 | 2 50 3 | 3 10 4 | 13 37 5 | 950 95 6 | 2 1 7 | 2 2 8 | 2 3 9 | 2 4 10 | 2 5 11 | 2 95 12 | 2 96 13 | 2 97 14 | 2 98 15 | 2 99 16 | 1000 1 17 | 1000 2 18 | 1000 3 19 | 1000 4 20 | 1000 5 21 | 1000 95 22 | 1000 96 23 | 1000 97 24 | 1000 98 25 | 1000 99 26 | -------------------------------------------------------------------------------- /2024/practice/line_by_line.md: -------------------------------------------------------------------------------- 1 | # Problem B: Line by Line 2 | 3 | You’ve found a solution to an implementation-heavy geometry problem that requires typing out N lines of code. Annoyingly, you only have a P% chance of typing out any given line without a mistake, and your code will only be accepted if all N lines are correct. The chance of making a mistake in one line is independent of the chance of making a mistake in any other line. 4 | 5 | You realize there might be a solution which only requires N−1 lines (each also having a P% chance of being typed correctly). However, instead of thinking about that, you could also just type out the N-line solution more carefully to increase P. How much would P have to increase to yield the same chance of success as needing to type one fewer line of code? 
6 | 7 | ## Constraints 8 | 1 ≤ T ≤ 100 9 | 2 ≤ N ≤ 1,000 10 | 1 ≤ P ≤ 99 11 | 12 | ## Input Format 13 | Input begins with an integer T, the number of test cases. Each case is a single line containing the integers N and P. 14 | 15 | ## Output Format 16 | For the ith test case, print "Case #i: " followed by how much higher P would need to be to make spending your time typing carefully be as successful as typing one line fewer with your original P. 17 | 18 | Your answer will be accepted if it is within an absolute or relative error of 10^-6. 19 | 20 | ## Sample Explanation 21 | In the first case, you initially need to type 2 lines. You can either type just 1 line with a 50% success rate, or you could improve your typing accuracy to sqrt(50%) ≈ 70.710678%, at which point you'd have a (sqrt(50%))^2 = 50% chance of successfully typing the original 2 lines. So you would need to increase P by 70.710678 − 50 = 20.710678 for both approaches to have an equal chance of success. -------------------------------------------------------------------------------- /2024/practice/line_by_line.out: -------------------------------------------------------------------------------- 1 | Case #1: 20.710678118654759 2 | Case #2: 11.544346900318834 3 | Case #3: 2.940819927087601 4 | Case #4: 0.005129467915044 5 | Case #5: 9.000000000000002 6 | Case #6: 12.142135623730949 7 | Case #7: 14.320508075688773 8 | Case #8: 16.000000000000000 9 | Case #9: 17.360679774997894 10 | Case #10: 2.467943448089638 11 | Case #11: 1.979589711327123 12 | Case #12: 1.488578017961051 13 | Case #13: 0.994949366116660 14 | Case #14: 0.498743710661997 15 | Case #15: 0.004615790278395 16 | Case #16: 0.007839369910818 17 | Case #17: 0.010538139191530 18 | Case #18: 0.012896247874723 19 | Case #19: 0.015001119818309 20 | Case #20: 0.004872987941551 21 | Case #21: 0.003918991463925 22 | Case #22: 0.002954588122994 23 | Case #23: 0.001979885316572 24 | Case #24: 0.000994988249470 25 | 
-------------------------------------------------------------------------------- /2024/practice/line_by_line_sample_input.txt: -------------------------------------------------------------------------------- 1 | 4 2 | 2 50 3 | 3 10 4 | 13 37 5 | 950 95 6 | -------------------------------------------------------------------------------- /2024/practice/line_by_line_sample_output.txt: -------------------------------------------------------------------------------- 1 | Case #1: 20.710678118654748 2 | Case #2: 11.544346900318839 3 | Case #3: 2.940819927087601 4 | Case #4: 0.005129467915043762 5 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0200r0.html 21 | template class y_combinator_result { 22 | Fun fun_; 23 | public: 24 | template explicit y_combinator_result(T &&fun): fun_(std::forward(fun)) {} 25 | template decltype(auto) operator()(Args &&...args) { return fun_(std::ref(*this), std::forward(args)...); } 26 | }; 27 | template decltype(auto) y_combinator(Fun &&fun) { return y_combinator_result>(std::forward(fun)); } 28 | 29 | 30 | template ostream& operator<<(ostream &os, const pair &p) { return os << '(' << p.first << ", " << p.second << ')'; } 31 | template ostream& operator<<(ostream& os, const tuple& t) { os << '('; apply([&os](const Args&... args) { size_t n = 0; ((os << args << (++n != sizeof...(Args) ? 
", " : "")), ...); }, t); return os << ')'; } 32 | template::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ", "; return os << '}'; } 33 | 34 | void dbg_out() { cerr << endl; } 35 | template void dbg_out(Head H, Tail... T) { cerr << ' ' << H; dbg_out(T...); } 36 | #ifdef NEAL_DEBUG 37 | #define dbg(...) cerr << '[' << __FILE__ << ':' << __LINE__ << "] (" << #__VA_ARGS__ << "):", dbg_out(__VA_ARGS__) 38 | #else 39 | #define dbg(...) 40 | #endif 41 | 42 | 43 | void run_case(int test_case) { 44 | int N, G; 45 | cin >> N >> G; 46 | vector E(N); 47 | 48 | for (auto &e : E) 49 | cin >> e; 50 | 51 | sort(E.begin(), E.end()); 52 | int index = 0; 53 | 54 | for (int i = 1; i < N; i++) 55 | if (abs(E[i] - G) <= abs(E[index] - G)) 56 | index = i; 57 | 58 | cout << "Case #" << test_case << ": " << N - index << ' ' << abs(E[index] - G) << '\n'; 59 | } 60 | 61 | int main() { 62 | int tests; 63 | cin >> tests; 64 | 65 | for (int tc = 1; tc <= tests; tc++) { 66 | run_case(tc); 67 | cout << flush; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_1.md: -------------------------------------------------------------------------------- 1 | # Problem D1: Line of Delivery (Part 1) 2 | 3 | This problem shares some similarities with problem D2, with key differences in **bold**. 4 | 5 | Candice is playing a solitaire game of curling on a 1-dimensional sheet of ice, **using stones of negligible size**. She will throw N stones (numbered 1 to N) from position 0, targeting a position G units to the right. In curling, though we say a stone is “thrown”, it’s actually slid along the ice. 6 | 7 | The i-th stone will be thrown with energy E_i, and will travel E_i units to the right unless it collides with another stone, in which case it will transfer its remaining energy to the stone it hits. 
Formally, we repeat this process until all stones are stationary: 8 | 9 | - **If the moving stone is at the same position as a stationary stone, the moving stone stops, and the stone it hits is now the moving stone with the remaining energy of the previous moving stone.** 10 | - Otherwise, the moving stone moves 1 unit to the right and its energy is reduced by 1. If the moving stone now has energy 0, it becomes stationary. 11 | 12 | After all of the stones are thrown, which stone is closest to the goal position G, and how far away from the goal is it? 13 | 14 | ## Constraints 15 | 16 | - 1 ≤ T ≤ 85 17 | - 1 ≤ N ≤ 300,000 18 | - 1 ≤ E_i, G ≤ 1,000,000 19 | - In each test case, no two stones are thrown with the same energy. 20 | - The sum of N across all test cases is at most 2,000,000. 21 | 22 | ## Input Format 23 | 24 | Input begins with an integer T, the number of test cases. Each case starts with a line that contains the integers N and G. Then N lines follow, the i-th of which contains E_i. 25 | 26 | ## Output Format 27 | 28 | For the i-th test case, print "Case #i: " followed by the index of the stone that ends up closest to the goal, G, and how far away it is from G. If there’s a tie, output the stone with the lowest index. 29 | 30 | ## Sample Explanation 31 | 32 | In the first case, no stones collide. They end up at positions 7 and 2 respectively, so the first stone is the closest to the goal (5) and is 2 units away. 33 | 34 | The second case is depicted below. The third stone collides with the second stone, stopping at position 5, while the second stone then continues on to position 7. The final positions of the stones in order are [9, 7, 5]. The third stone is the closest to the goal (1) and ends up 4 units away. 35 | 36 | ![Line of Delivery](line_of_delivery_part_1.png) 37 | 38 | In the third case, the final positions of the stones are [9, 8, 7, 6], so the third stone ends up exactly on the goal. 
39 | 40 | In the fourth case, both stones are equally distant from the goal. We break ties by picking the lower index. -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_1.out: -------------------------------------------------------------------------------- 1 | Case #1: 1 2 2 | Case #2: 3 4 3 | Case #3: 3 0 4 | Case #4: 1 5 5 | Case #5: 6 0 6 | Case #6: 3 5 7 | Case #7: 2 5 8 | Case #8: 1 5 9 | Case #9: 1 150 10 | Case #10: 5 9 11 | Case #11: 56345 1 12 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wandb/aihackercup/0ad5cde4d5bb64572bdb8373296e87392995e5f7/2024/practice/line_of_delivery_part_1.png -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_1_sample_input.txt: -------------------------------------------------------------------------------- 1 | 4 2 | 2 5 3 | 7 4 | 2 5 | 3 1 6 | 9 7 | 5 8 | 7 9 | 4 7 10 | 8 11 | 7 12 | 9 13 | 6 14 | 2 10 15 | 15 16 | 5 17 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_1_sample_output.txt: -------------------------------------------------------------------------------- 1 | Case #1: 1 2 2 | Case #2: 3 4 3 | Case #3: 3 0 4 | Case #4: 1 5 5 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_2.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | // 
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0200r0.html 21 | template class y_combinator_result { 22 | Fun fun_; 23 | public: 24 | template explicit y_combinator_result(T &&fun): fun_(std::forward(fun)) {} 25 | template decltype(auto) operator()(Args &&...args) { return fun_(std::ref(*this), std::forward(args)...); } 26 | }; 27 | template decltype(auto) y_combinator(Fun &&fun) { return y_combinator_result>(std::forward(fun)); } 28 | 29 | 30 | template ostream& operator<<(ostream &os, const pair &p) { return os << '(' << p.first << ", " << p.second << ')'; } 31 | template ostream& operator<<(ostream& os, const tuple& t) { os << '('; apply([&os](const Args&... args) { size_t n = 0; ((os << args << (++n != sizeof...(Args) ? ", " : "")), ...); }, t); return os << ')'; } 32 | template::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ", "; return os << '}'; } 33 | 34 | void dbg_out() { cerr << endl; } 35 | template void dbg_out(Head H, Tail... T) { cerr << ' ' << H; dbg_out(T...); } 36 | #ifdef NEAL_DEBUG 37 | #define dbg(...) cerr << '[' << __FILE__ << ':' << __LINE__ << "] (" << #__VA_ARGS__ << "):", dbg_out(__VA_ARGS__) 38 | #else 39 | #define dbg(...) 40 | #endif 41 | 42 | // Note that T must be able to handle sums of values, not just individual values. 43 | using T = int64_t; 44 | 45 | struct splay_change { 46 | T to_add; 47 | 48 | splay_change(T _to_add = 0) : to_add(_to_add) {} 49 | 50 | bool has_change() const { 51 | return to_add != 0; 52 | } 53 | 54 | // Return the combined result of applying this splay_change followed by `other`. 
55 | splay_change combine(const splay_change &other) const { 56 | return splay_change(to_add + other.to_add); 57 | } 58 | }; 59 | 60 | struct splay_node { 61 | splay_node *parent = nullptr, *child[2] = {nullptr, nullptr}; 62 | int size = 1; 63 | 64 | T value, maximum, sum; 65 | splay_change change; 66 | 67 | friend int get_size(splay_node *x) { 68 | return x == nullptr ? 0 : x->size; 69 | } 70 | 71 | friend T get_max(splay_node *x) { 72 | return x == nullptr ? numeric_limits::lowest() : x->maximum; 73 | } 74 | 75 | friend T get_sum(splay_node *x) { 76 | return x == nullptr ? 0 : x->sum; 77 | } 78 | 79 | int parent_index() const { 80 | return parent == nullptr ? -1 : int(this == parent->child[1]); 81 | } 82 | 83 | void set_child(int index, splay_node *x) { 84 | child[index] = x; 85 | 86 | if (x != nullptr) 87 | x->parent = this; 88 | } 89 | 90 | void apply_and_combine(const splay_change &now) { 91 | value += now.to_add; 92 | sum += size * now.to_add; 93 | maximum += now.to_add; 94 | 95 | change = change.combine(now); 96 | } 97 | 98 | void push() { 99 | if (change.has_change()) { 100 | if (child[0] != nullptr) child[0]->apply_and_combine(change); 101 | if (child[1] != nullptr) child[1]->apply_and_combine(change); 102 | change = splay_change(); 103 | } 104 | } 105 | 106 | void join() { 107 | size = get_size(child[0]) + get_size(child[1]) + 1; 108 | // TODO: Add other operations here. Remember to store my_value in addition to, e.g., min_value or max_value. 
109 | sum = value + get_sum(child[0]) + get_sum(child[1]); 110 | maximum = max({value, get_max(child[0]), get_max(child[1])}); 111 | } 112 | }; 113 | 114 | int64_t splay_count = 0; 115 | 116 | struct splay_tree { 117 | static const int POOL_SIZE = 10000; 118 | static vector node_pool; 119 | static vector pointers_to_delete; 120 | 121 | static splay_node* new_node(const T &value) { 122 | if (node_pool.empty()) { 123 | splay_node *ptr = new splay_node[POOL_SIZE]; 124 | pointers_to_delete.push_back(ptr); 125 | node_pool.reserve(POOL_SIZE); 126 | 127 | for (int i = POOL_SIZE - 1; i >= 0; i--) 128 | node_pool.push_back(ptr + i); 129 | } 130 | 131 | splay_node *node = node_pool.back(); 132 | node_pool.pop_back(); 133 | *node = splay_node(); 134 | node->value = value; 135 | // TODO: add more logic for creating a new node here. 136 | node->join(); 137 | return node; 138 | } 139 | 140 | static bool _exit_delete_setup; 141 | 142 | static void _delete_pointers() { 143 | for (splay_node *node : pointers_to_delete) 144 | delete[] node; 145 | 146 | pointers_to_delete.clear(); 147 | } 148 | 149 | ~splay_tree() { 150 | if (!_exit_delete_setup) { 151 | atexit(_delete_pointers); 152 | _exit_delete_setup = true; 153 | } 154 | } 155 | 156 | splay_node *root = nullptr; 157 | 158 | splay_tree(const vector &values = {}) { 159 | init(values); 160 | } 161 | 162 | splay_tree(splay_node *node) { 163 | set_root(node); 164 | } 165 | 166 | splay_node *construct(const vector &values, int start, int end) { 167 | if (start >= end) 168 | return nullptr; 169 | 170 | if (end - start == 1) 171 | return new_node(values[start]); 172 | 173 | int mid = (start + end) / 2; 174 | splay_node *current = new_node(values[mid]); 175 | current->set_child(0, construct(values, start, mid)); 176 | current->set_child(1, construct(values, mid + 1, end)); 177 | current->join(); 178 | return current; 179 | } 180 | 181 | // Constructs the splay tree in linear time if the values are already sorted. 
182 | void init(vector values) { 183 | set_root(construct(values, 0, int(values.size()))); 184 | } 185 | 186 | bool empty() const { 187 | return root == nullptr; 188 | } 189 | 190 | int size() const { 191 | return get_size(root); 192 | } 193 | 194 | splay_node *set_root(splay_node *x) { 195 | if (x != nullptr) 196 | x->parent = nullptr; 197 | 198 | return root = x; 199 | } 200 | 201 | void rotate_up(splay_node *x, bool x_join = true) { 202 | splay_node *p = x->parent, *gp = p->parent; 203 | int index = x->parent_index(); 204 | 205 | if (gp == nullptr) 206 | set_root(x); 207 | else 208 | gp->set_child(p->parent_index(), x); 209 | 210 | p->set_child(index, x->child[!index]); 211 | x->set_child(!index, p); 212 | p->join(); 213 | 214 | if (x_join) 215 | x->join(); 216 | } 217 | 218 | // Note that splay(x) handles both pushing x and joining all nodes from x to the root, inclusive. 219 | void splay(splay_node *x) { 220 | splay_count++; 221 | x->push(); 222 | 223 | while (x != root) { 224 | splay_node *p = x->parent; 225 | 226 | if (p != root) 227 | rotate_up(x->parent_index() == p->parent_index() ? 
p : x, false); 228 | 229 | rotate_up(x, false); 230 | } 231 | 232 | x->join(); 233 | } 234 | 235 | splay_node *node_at_index(int index) { 236 | if (index < 0 || index >= size()) 237 | return nullptr; 238 | 239 | splay_node *current = root; 240 | 241 | while (current != nullptr) { 242 | current->push(); 243 | int left_size = get_size(current->child[0]); 244 | 245 | if (index == left_size) { 246 | splay(current); 247 | return current; 248 | } 249 | 250 | if (index < left_size) { 251 | current = current->child[0]; 252 | } else { 253 | current = current->child[1]; 254 | index -= left_size + 1; 255 | } 256 | } 257 | 258 | assert(false); 259 | } 260 | 261 | splay_node *insert(int index, const T &value) { 262 | return insert(index, new_node(value)); 263 | } 264 | 265 | splay_node *insert(int index, splay_node *node) { 266 | assert(0 <= index && index <= size()); 267 | 268 | if (node == nullptr) 269 | return nullptr; 270 | else if (root == nullptr) 271 | return set_root(node); 272 | 273 | splay_node *current = root, *previous = nullptr; 274 | int previous_dir = -1; 275 | 276 | while (current != nullptr) { 277 | current->push(); 278 | previous = current; 279 | int left_size = get_size(current->child[0]); 280 | 281 | if (index <= left_size) { 282 | current = current->child[0]; 283 | previous_dir = 0; 284 | } else { 285 | current = current->child[1]; 286 | previous_dir = 1; 287 | index -= left_size + 1; 288 | } 289 | } 290 | 291 | previous->set_child(previous_dir, node); 292 | splay(node); 293 | return node; 294 | } 295 | 296 | splay_node *begin() { 297 | if (root == nullptr) 298 | return nullptr; 299 | 300 | splay_node *x = root; 301 | x->push(); 302 | 303 | while (x->child[0] != nullptr) { 304 | x = x->child[0]; 305 | x->push(); 306 | } 307 | 308 | splay(x); 309 | return x; 310 | } 311 | 312 | // To iterate through all nodes in order: 313 | // for (splay_node *node = tree.begin(); node != nullptr; node = tree.successor(node)) 314 | splay_node *successor(splay_node *x) const 
{ 315 | if (x == nullptr) 316 | return nullptr; 317 | 318 | x->push(); 319 | 320 | if (x->child[1] != nullptr) { 321 | x = x->child[1]; 322 | x->push(); 323 | 324 | while (x->child[0] != nullptr) { 325 | x = x->child[0]; 326 | x->push(); 327 | } 328 | 329 | return x; 330 | } 331 | 332 | while (x->parent_index() == 1) 333 | x = x->parent; 334 | 335 | return x->parent; 336 | } 337 | 338 | splay_node *predecessor(splay_node *x) const { 339 | if (x == nullptr) 340 | return nullptr; 341 | 342 | x->push(); 343 | 344 | if (x->child[0] != nullptr) { 345 | x = x->child[0]; 346 | x->push(); 347 | 348 | while (x->child[1] != nullptr) { 349 | x = x->child[1]; 350 | x->push(); 351 | } 352 | 353 | return x; 354 | } 355 | 356 | while (x->parent_index() == 0) 357 | x = x->parent; 358 | 359 | return x->parent; 360 | } 361 | 362 | splay_node *last() { 363 | if (root == nullptr) 364 | return nullptr; 365 | 366 | splay_node *x = root; 367 | x->push(); 368 | 369 | while (x->child[1] != nullptr) { 370 | x = x->child[1]; 371 | x->push(); 372 | } 373 | 374 | splay(x); 375 | return x; 376 | } 377 | 378 | void clear() { 379 | vector nodes; 380 | nodes.reserve(size()); 381 | 382 | for (splay_node *node = begin(); node != nullptr; node = successor(node)) 383 | nodes.push_back(node); 384 | 385 | for (splay_node *node : nodes) { 386 | // Instead of deleting, add `node` back to `node_pool`. 387 | *node = splay_node(); 388 | node_pool.push_back(node); 389 | } 390 | 391 | set_root(nullptr); 392 | } 393 | 394 | void erase(splay_node *x) { 395 | splay(x); 396 | 397 | if (x->child[0] == nullptr || x->child[1] == nullptr) { 398 | set_root(x->child[int(x->child[0] == nullptr)]); 399 | } else { 400 | set_root(x->child[0]); 401 | insert(size(), x->child[1]); 402 | } 403 | 404 | // Instead of deleting, add `x` back to `node_pool`. 405 | *x = splay_node(); 406 | node_pool.push_back(x); 407 | } 408 | 409 | // Detach x from its parent, producing two separate splay trees as a result. 
410 | void detach(splay_node *x) { 411 | if (x == nullptr) 412 | return; 413 | 414 | if (x == root) { 415 | set_root(nullptr); 416 | return; 417 | } 418 | 419 | splay_node *parent = x->parent; 420 | assert(parent != nullptr); 421 | parent->set_child(x->parent_index(), nullptr); 422 | x->parent = nullptr; 423 | splay(parent); 424 | x->push(); 425 | } 426 | 427 | // Returns a splay_node pointer representing the first `count` nodes. If none, returns `nullptr`. 428 | splay_node *query_prefix_count(int count) { 429 | if (count <= 0) 430 | return nullptr; 431 | else if (count >= size()) 432 | return root; 433 | 434 | splay_node *node = node_at_index(count); 435 | splay(node); 436 | return node->child[0]; 437 | } 438 | 439 | // Returns a splay_node pointer representing the last `count` nodes. If none, returns `nullptr`. 440 | splay_node *query_suffix_count(int count) { 441 | if (count <= 0) 442 | return nullptr; 443 | else if (count >= size()) 444 | return root; 445 | 446 | int index = size() - count; 447 | splay_node *node = node_at_index(index - 1); 448 | splay(node); 449 | return node->child[1]; 450 | } 451 | 452 | // Returns a splay_node pointer representing the index range [start, end). If none, returns `nullptr`. 453 | splay_node *query_range(int start, int end) { 454 | if (start >= end) 455 | return nullptr; 456 | else if (start <= 0) 457 | return query_prefix_count(end); 458 | else if (end >= size()) 459 | return query_suffix_count(size() - start); 460 | 461 | splay_node *before = node_at_index(start - 1); 462 | splay_node *after = node_at_index(end); 463 | splay(after); 464 | splay(before); 465 | 466 | if (after->parent != before) 467 | rotate_up(after); 468 | 469 | assert(before->child[1] == after); 470 | return after->child[0]; 471 | } 472 | 473 | // Applies an update to the subtree rooted at `node`. 
474 | void update(splay_node *node, const splay_change &change) { 475 | if (node == nullptr) 476 | return; 477 | 478 | node->apply_and_combine(change); 479 | splay(node); 480 | } 481 | 482 | // Finds the end of the last subarray starting at `first` satisfying `should_join` via binary search. 483 | // should_join(splay_node *node, bool single_node) -> bool 484 | // Determines whether we should join with a node (if single_node then just the node, else the subtree). 485 | // If true, actually performs the join. 486 | template 487 | int find_last_subarray(T_bool &&should_join, int first = 0) { 488 | if (!should_join(nullptr, false)) 489 | return first - 1; 490 | 491 | splay_node *current = first == 0 ? root : query_suffix_count(size() - first); 492 | splay_node *previous = nullptr; 493 | int end = first; 494 | 495 | while (current != nullptr) { 496 | current->push(); 497 | previous = current; 498 | 499 | if (!should_join(current->child[0], false)) { 500 | current = current->child[0]; 501 | } else { 502 | end += get_size(current->child[0]); 503 | 504 | if (!should_join(current, true)) 505 | break; 506 | 507 | end++; 508 | current = current->child[1]; 509 | } 510 | } 511 | 512 | if (previous != nullptr) 513 | splay(previous); 514 | 515 | return end; 516 | } 517 | }; 518 | 519 | bool splay_tree::_exit_delete_setup = false; 520 | vector splay_tree::node_pool; 521 | vector splay_tree::pointers_to_delete; 522 | 523 | 524 | const int EG_MAX = int(2e6) + 5; 525 | 526 | void run_case(int test_case) { 527 | int N, G; 528 | cin >> N >> G; 529 | vector E(N); 530 | 531 | for (auto &e : E) 532 | cin >> e; 533 | 534 | splay_tree tree; 535 | 536 | for (auto &e : E) { 537 | int low = 0, high = EG_MAX; 538 | 539 | while (low < high) { 540 | int mid = low + (high - low + 1) / 2; 541 | 542 | int below = tree.find_last_subarray([&](splay_node *node, bool single_node) -> bool { 543 | return (single_node ? 
node->value : get_max(node)) < mid; 544 | }); 545 | 546 | if (mid - below <= e) 547 | low = mid; 548 | else 549 | high = mid - 1; 550 | } 551 | 552 | int below = tree.find_last_subarray([&](splay_node *node, bool single_node) -> bool { 553 | return (single_node ? node->value : get_max(node)) < low; 554 | }); 555 | tree.update(tree.query_prefix_count(below), splay_change(-1)); 556 | tree.insert(below, low); 557 | } 558 | 559 | int closest = 2 * EG_MAX, best_index = -1; 560 | int index = 0; 561 | 562 | for (splay_node *node = tree.begin(); node != nullptr; node = tree.successor(node), index++) { 563 | int dist = abs(int(node->value) - G); 564 | 565 | if (dist <= closest) { 566 | closest = dist; 567 | best_index = index; 568 | } 569 | } 570 | 571 | cout << "Case #" << test_case << ": " << N - best_index << ' ' << closest << '\n'; 572 | } 573 | 574 | int main() { 575 | int tests; 576 | cin >> tests; 577 | 578 | for (int tc = 1; tc <= tests; tc++) { 579 | run_case(tc); 580 | cout << flush; 581 | } 582 | } 583 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_2.md: -------------------------------------------------------------------------------- 1 | # Problem D2: Line of Delivery (Part 2) 2 | 3 | This problem shares some similarities with problem D1, with key differences in **bold**. 4 | 5 | Candice is playing a solitaire game of curling on a 1-dimensional sheet of ice, **using stones that are 1 unit wide**. She will throw N stones (numbered 1 to N) from position 0, targeting a position G units to the right. In curling, though we say a stone is “thrown”, it’s actually slid along the ice. 6 | 7 | The i-th stone will be thrown with energy Ei, and will travel Ei units to the right unless it collides with another stone, in which case it will transfer its remaining energy to the stone it hits. 
Formally, we repeat this process until all stones are stationary: 8 | 9 | - **If the moving stone is at position p and there is a stationary stone at position p+1, the moving stone stops at position p, and the stone at position p+1 is now the moving stone with the remaining energy of the previous moving stone**. 10 | - Otherwise, the moving stone moves 1 unit to the right and its energy is reduced by 1. If the moving stone now has energy 0, it becomes stationary. 11 | 12 | After all of the stones are thrown, which stone is closest to the goal position G, and how far away from the goal is it? 13 | 14 | ## Constraints 15 | - 1 ≤ T ≤ 80 16 | - 1 ≤ N ≤ 300,000 17 | - 1 ≤ G ≤ 1,000,000 18 | - N ≤ Ei ≤ 1,000,000 19 | 20 | **All stones are thrown with energy Ei ≥ N, so that stones do not pile up near Candice, but the energies are not necessarily unique.** 21 | 22 | The sum of N across all test cases is at most 2,000,000. 23 | 24 | ## Input Format 25 | Input begins with an integer T, the number of test cases. Each case starts with a line that contains the integers N and G. Then N lines follow, the ith of which contains Ei. 26 | 27 | ## Output Format 28 | 29 | For the i-th test case, print "Case #i: " followed by the index of the stone that ends up closest to the goal, G, and how far away it is from G. If there’s a tie, output the stone with the lowest index. 30 | 31 | ## Sample Explanation 32 | 33 | The third sample case is depicted below. The first stone stops at position 8, and the second stone stops at position 7. The third stone starts with an energy of 9, but stops at position 6, transferring 3 energy to the second stone. The second stone is already touching the first stone, so it transfers 3 energy to the first stone, which then moves to position 11. The fourth stone starts with energy 6, and stops at position 5, transferring 1 energy to the next stone, which again transfers 1 energy to the next stone, which then moves to position 8. 
So the final positions of the stones are [11, 8, 6, 5] with stone 2 being at position 8, the goal. 34 | 35 | ![line_of_delivery_part_2](line_of_delivery_part_2.png) 36 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_2.out: -------------------------------------------------------------------------------- 1 | Case #1: 1 2 2 | Case #2: 2 2 3 | Case #3: 2 0 4 | Case #4: 1 5 5 | Case #5: 4 1 6 | Case #6: 10 1 7 | Case #7: 3 4 8 | Case #8: 3 6 9 | Case #9: 2 2 10 | Case #10: 1 148 11 | Case #11: 5 8 12 | Case #12: 20161 1 13 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wandb/aihackercup/0ad5cde4d5bb64572bdb8373296e87392995e5f7/2024/practice/line_of_delivery_part_2.png -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_2_sample_input.txt: -------------------------------------------------------------------------------- 1 | 5 2 | 2 5 3 | 7 4 | 2 5 | 3 6 6 | 9 7 | 5 8 | 7 9 | 4 8 10 | 8 11 | 7 12 | 9 13 | 6 14 | 2 10 15 | 15 16 | 5 17 | 4 4 18 | 8 19 | 5 20 | 8 21 | 5 22 | -------------------------------------------------------------------------------- /2024/practice/line_of_delivery_part_2_sample_output.txt: -------------------------------------------------------------------------------- 1 | Case #1: 1 2 2 | Case #2: 2 2 3 | Case #3: 2 0 4 | Case #4: 1 5 5 | Case #5: 4 1 6 | -------------------------------------------------------------------------------- /2024/practice/walk_the_line.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 
#include 15 | #include 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0200r0.html 21 | template class y_combinator_result { 22 | Fun fun_; 23 | public: 24 | template explicit y_combinator_result(T &&fun): fun_(std::forward(fun)) {} 25 | template decltype(auto) operator()(Args &&...args) { return fun_(std::ref(*this), std::forward(args)...); } 26 | }; 27 | template decltype(auto) y_combinator(Fun &&fun) { return y_combinator_result>(std::forward(fun)); } 28 | 29 | 30 | template ostream& operator<<(ostream &os, const pair &p) { return os << '(' << p.first << ", " << p.second << ')'; } 31 | template ostream& operator<<(ostream& os, const tuple& t) { os << '('; apply([&os](const Args&... args) { size_t n = 0; ((os << args << (++n != sizeof...(Args) ? ", " : "")), ...); }, t); return os << ')'; } 32 | template::value, typename T_container::value_type>::type> ostream& operator<<(ostream &os, const T_container &v) { os << '{'; string sep; for (const T &x : v) os << sep << x, sep = ", "; return os << '}'; } 33 | 34 | void dbg_out() { cerr << endl; } 35 | template void dbg_out(Head H, Tail... T) { cerr << ' ' << H; dbg_out(T...); } 36 | #ifdef NEAL_DEBUG 37 | #define dbg(...) cerr << '[' << __FILE__ << ':' << __LINE__ << "] (" << #__VA_ARGS__ << "):", dbg_out(__VA_ARGS__) 38 | #else 39 | #define dbg(...) 40 | #endif 41 | 42 | 43 | void run_case(int test_case) { 44 | int N; 45 | int64_t K; 46 | cin >> N >> K; 47 | vector S(N); 48 | 49 | for (auto &s : S) 50 | cin >> s; 51 | 52 | sort(S.begin(), S.end()); 53 | int64_t need = max(2 * N - 3, 1) * S[0]; 54 | cout << "Case #" << test_case << ": " << (need <= K ? 
"YES" : "NO") << '\n'; 55 | } 56 | 57 | int main() { 58 | int tests; 59 | cin >> tests; 60 | 61 | for (int tc = 1; tc <= tests; tc++) { 62 | run_case(tc); 63 | cout << flush; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /2024/practice/walk_the_line.in: -------------------------------------------------------------------------------- 1 | 14 2 | 4 17 3 | 1 4 | 2 5 | 5 6 | 10 7 | 4 4 8 | 1 9 | 2 10 | 5 11 | 10 12 | 2 22 13 | 22 14 | 22 15 | 3 1000000000 16 | 1000000000 17 | 1000000000 18 | 1000000000 19 | 1 10 20 | 12 21 | 1 100 22 | 12 23 | 1 10 24 | 10 25 | 1 10 26 | 9 27 | 1 10 28 | 11 29 | 2 10 30 | 10 31 | 5 32 | 2 10 33 | 10 34 | 15 35 | 5 100 36 | 15 37 | 14 38 | 20 39 | 28 40 | 16 41 | 5 100 42 | 16 43 | 15 44 | 20 45 | 28 46 | 17 47 | 5 987654321 48 | 345678912 49 | 456789123 50 | 567891234 51 | 678912345 52 | 789123456 53 | -------------------------------------------------------------------------------- /2024/practice/walk_the_line.md: -------------------------------------------------------------------------------- 1 | # Problem A: Walk the Line 2 | 3 | There’s an old, narrow bridge that a group of N travelers wish to cross during the night. The bridge can only support the weight of at most 2 people at a time. The travelers must stay together and use the group’s only flashlight while on the bridge. Each traveler i can cross the bridge in Si seconds alone. 4 | 5 | Thankfully, the group has a very lightweight wheelbarrow. There are two possible scenarios for crossing: 6 | 7 | 1. Traveler i can cross the bridge alone in Si seconds, optionally bringing the wheelbarrow. 8 | 2. Two travelers i and j can both cross together in Si seconds if traveler j rides in the wheelbarrow pushed by traveler i. 9 | 10 | In both cases, any group crossing the bridge must bring the flashlight. The flashlight can be returned to the initial side by the same rules mentioned above. 
The task is to determine if there is a strategy for all travelers to cross the bridge within K seconds. 11 | 12 | ## Constraints 13 | 14 | • 1 ≤ T ≤ 95 (Number of test cases) 15 | • 1 ≤ N ≤ 1,000 (Number of travelers) 16 | • 1 ≤ Si, K ≤ 1,000,000,000 (Time taken by each traveler and maximum allowed time) 17 | 18 | ## Input Format 19 | 20 | • The input begins with an integer T, the number of test cases. 21 | • Each test case starts with a line containing two integers N and K. 22 | • The next N lines contain one integer each, representing the time Si it takes for the i-th traveler to cross the bridge alone. 23 | 24 | ## Output Format 25 | 26 | For each test case, print Case #i: YES if the travelers can all cross the bridge within K seconds, or Case #i: NO if they cannot. 27 | 28 | ## Sample Explanation 29 | 30 | Here’s a possible strategy for the first case. Traveler 3 can carry traveler 4 across, and then return alone. Then traveler 2 can carry traveler 3 across, and then return alone. Then traveler 1 can carry traveler 2 across. This takes 5 + 5 + 2 + 2 + 1 = 15 seconds. 31 | In the second case, there is no strategy that gets all 4 travelers across within 4 seconds. 32 | In the third case, both travelers can cross in exactly the 22 allotted seconds if they travel together. 
-------------------------------------------------------------------------------- /2024/practice/walk_the_line.out: -------------------------------------------------------------------------------- 1 | Case #1: YES 2 | Case #2: NO 3 | Case #3: YES 4 | Case #4: NO 5 | Case #5: NO 6 | Case #6: YES 7 | Case #7: YES 8 | Case #8: YES 9 | Case #9: NO 10 | Case #10: YES 11 | Case #11: YES 12 | Case #12: YES 13 | Case #13: NO 14 | Case #14: NO 15 | -------------------------------------------------------------------------------- /2024/practice/walk_the_line_sample_input.txt: -------------------------------------------------------------------------------- 1 | 6 2 | 4 17 3 | 1 4 | 2 5 | 5 6 | 10 7 | 4 4 8 | 1 9 | 2 10 | 5 11 | 10 12 | 2 22 13 | 22 14 | 22 15 | 3 1000000000 16 | 1000000000 17 | 1000000000 18 | 1000000000 19 | 1 10 20 | 12 21 | 1 100 22 | 12 23 | -------------------------------------------------------------------------------- /2024/practice/walk_the_line_sample_output.txt: -------------------------------------------------------------------------------- 1 | Case #1: YES 2 | Case #2: NO 3 | Case #3: YES 4 | Case #4: NO 5 | Case #5: NO 6 | Case #6: YES 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Weights & Biases 3 | Weights & Biases 4 |

5 | 6 | # ⚡️ Competition - AI Hacker Cup 7 | 8 | 9 | [Weights & Biases](https://wandb.ai/site?utm_source=github&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) are running a 7-day Lightning Competition focussed on using LLMs to solve the 2023 practice problems for the [2024 NeurIPS AI Hacker Cup](https://hackercupai.github.io/) challenge. The competition involves solving very challenging logic problems using code. 10 | 11 | #### Goal 12 | The goal is to try and solve all 5 of the 2023 practice questions for the AI Hacker Cup using MistralAI's models. We’re offering free MistralAI api access via the code in this colab to get people started. For context, the **[starter notebook](https://github.com/wandb/aihackercup/blob/main/rag_code_agent.ipynb)** included in this repo has free MistralAI api access and can consistently get 2 out of 5 solutions correct using `mistral-large-latest`. 13 | 14 | ### Deadline 15 | 16 | The deadline to submit winning solutions is Monday October 16th , 8am PT / 5pm CET. 17 | 18 | ### Prizes 19 | 20 | Weights & Biases are giving away a pair of Meta Ray-Ban Smart Glasses for the first individual to submit code that solves: 21 | - 3 out of 5 correct solutions 22 | - 4 out of 5 correct solutions 23 | - 5 out of 5 correct solutions 24 | 25 | (i.e. in total 3 pairs of sunglasses to give away) 26 | 27 | ## Getting Started 28 | 29 | **Starter Notebook** 30 | 31 | We have included a **[starter notebook](https://github.com/wandb/aihackercup/blob/main/rag_code_agent.ipynb)** which includes free MistralAI api access and which can consistently solve 2 out of 5 problems. 
This **[AI Hacker Cup lecture video](https://www.youtube.com/watch?v=cObBj2UpWK8)** includes an explanation of the approach taken, see the Resources section 32 | 33 | **Discord** 34 | 35 | The official 2024 NeurIPS Hacker Cup AI [discord is here](https://discord.gg/NkDxUd43Wf) and has a channel called `#lighting-comp-practice-problems` for discussion about this particular 7-day competition. 36 | 37 | ## Submissions 38 | 39 | To submit code for verification you neeed to submit the following to the **[Submissions Form](https://forms.gle/5t3SgaxR11FhGAGX6)**: 40 | 41 | - a zipped directory with a README and a requirements.txt 42 | - a link to the [W&B Weave](https://weave-docs.wandb.ai/tutorial-eval?utm_source=github&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) evaluation 43 | 44 | You can use your own code to generate solutions or you can modify the code in this repo. The one requirement is that evaluations must be run using W&B Weave as we'll be using those logs to help verify winning solutions. 45 | 46 | ## Rules 47 | 48 | *Note these are the rules for the W&B Lightning Competition, not the official NeurIPS AI Hacker Cup challenge* 49 | 50 | **Generalizability** 51 | 52 | The goal of this Lightning competition is to create solutions that are generalizable to this entire class of coding-based problem solving. As the solutions to the competition problems are available online, no code or specific references or descriptions of the solutions to these practice questions are permitted in the prompts or datasets used. W&B decisions are final. 53 | 54 | **Open Source** 55 | 56 | All winning submissions code can and will be open sourced by W&B to be eligible for a prize. 
Winning solutions will be open sourced immediately after verification, eg if someone is the first to get to 4 out of 5 solutions correct and W&B verify and award a prize, we’ll open source that code as soon as we can, which can be during the comeptition, so that others can build on top of it to get to 5 out of 5. 57 | 58 | **Reproducibility** 59 | 60 | Solutions must be reproducible, runnable code must be submitted and verified by W&B team. Given the non-deterministic nature of LLM outputs, solutions must cross the prize winning threshold in at least 2 out of 3 trials. i.e. if you submit for first to 3 out of 5 solved, your codebase must hit this performance twice out of three possible attempts. 61 | 62 | **Weave Evaluations** 63 | 64 | Evaluations must be run using [W&B Weave](https://weave-docs.wandb.ai/tutorial-eval?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) and a link to those evaluations included in the submission. 65 | 66 | **No fine-tuning** 67 | 68 | This quick competition isn't focussed on fine-tuning models, only the vanilla MistralAI models available via the MistralAI api can be used. You can use the local versions of these models. 69 | 70 | **Time Limit** 71 | 72 | There will be a 20 minute time limit to generate solutions for all 5 problems, the MistraAI api will be used when running the submitted code. 73 | 74 | **One prize per individual** 75 | 76 | An individual can only win 1 single prize. i.e. if you are the first to solve 4 out of 5 challenges you are not eligible to win a second pair of Ray-Bans. Working in teams are allowed but there is only 1 pair of Ray-Ban per prize category, i.e. you'll have to figure out how to divide 1 pair of sunglasses among 2+ people. 77 | 78 | ## Resources 79 | 80 | This folder contains the implementation of a RAG agent to solve the Hacker Cup problems using LLMs. It includes a colab and code for downloading and preprocessing the datasets and generating solutions using a Retrieval Model. 
81 | 82 | The RAG agent is based on a combination of a retriever and a generator model. 83 | The retriever is used to retrieve similar historical problems and solutions from 84 | the [codecontests](https://huggingface.co/datasets/deepmind/code_contests) dataset and prompt an LLM with few-shot 85 | examples to generate solutions for the current problem. 86 | 87 | You can learn more about the approach in this youtube video: 88 | 89 | 90 | 91 | 92 | 93 | ## Contents 94 | 95 | 1. `rag_code_agent.ipynb`: this notebook contains a full walkthrough of the RAG agent and how to use it to solve Hacker Cup 96 | problems. 97 | 2. `retriever.py`: this script contains the implementation of the retriever we used. 98 | 3. `agent.py`: this script contains the implementation of the agent we used to solve the problems. 99 | 4. `utils.py`: utility functions used in retrieving and generating solutions. 100 | 5. `requirements.txt`: list of required packages to run the code. 101 | 102 | 103 | 104 | ## Download Full Raw Dataset 105 | 106 | Alternatively, you can download the dataset by running the download script from the [submit-first-solution](https://github.com/HackerCupAI/starter-kits/tree/main/submit_first_solution). Specifically, you can run the following command to download the dataset: 107 | 108 | ```bash 109 | python download.py --year 2023 --dataset_folder data 110 | ``` 111 | 112 | 113 | This should create a `dataset` folder with the problems and solutions. Here's an example of what the data looks like for the `dim_sum_delivery` problem from the `2023` season: 114 | 115 | ``` 116 | data/dataset/2023/practice 117 | ... 118 | ├── dim_sum_delivery.cpp 119 | ├── dim_sum_delivery.in 120 | ├── dim_sum_delivery.md 121 | ├── dim_sum_delivery.out 122 | ├── dim_sum_delivery_sample_input.txt 123 | ├── dim_sum_delivery_sample_output.txt 124 | ├── dim_sum_delivery_sol.md 125 | ... 126 | ``` 127 | 128 | Each problem has a `in`, `out`, `md`, `cpp`, and `sol` file. 
129 | 130 | The `in` file contains the input data for the problem. 131 | The `out` file contains the expected output for the problem. 132 | The `md` file contains the problem statement. 133 | The `cpp` file contains the source code to the solution. 134 | The `sol` file contains the detailed solution to the problem. 135 | The `sample_input.txt` and `sample_output.txt` files contain the sample input and output for the problem. These are the test cases that will be available to the agent during development and evaluation. 136 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wandb/aihackercup/0ad5cde4d5bb64572bdb8373296e87392995e5f7/__init__.py -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import os 4 | from typing import List 5 | 6 | import weave 7 | 8 | from retriever import Retriever, rerank_docs 9 | from utils import (FAST_LLM, STRONG_LLM, Analysis, Problem, Reflection, 10 | Solution, async_client, check_correctness, format_example, 11 | format_examples, format_response) 12 | 13 | logging.basicConfig( 14 | format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO 15 | ) 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | MAX_TOKENS = os.getenv("MAX_TOKENS", 4096) 20 | # BASE_URL = os.getenv("BASE_URL", None) 21 | 22 | SOLVER_INSTRUCTIONS = """You are a world-class competitive programmer tasked with solving a programming problem. 23 | You will be provided with a problem statement, and you need to create a Python3 solution for it. 24 | Your task it to develop a winning solution to the problem in Python3 programming language. 25 | You will do this in a step-by-step manner. 
26 | 27 | Step 1: Extract the core question and the problem-solving information from the problem statement. 28 | Step 2: Describe the algorithm used to solve the problem. 29 | Step 3: Write a short tutorial on the algorithm and how it works. 30 | Step 4: Generate a step by step plan to solve the problem. 31 | Step 5: Generate the pseudocode to solve the problem. 32 | Step 6: Write the final solution in Python3 programming language to solve the problem. 33 | 34 | Competition Guidelines: 35 | a. Do not use any external libraries; stick to Python 3 standard library 36 | b. Handle input and output using standard input/output (stdin/stdout) 37 | c. Use helper functions to improve readability of the code. 38 | c. Use the `input()` function to take input from stdin and print the output to stdout. 39 | d. Do not add extra print statements otherwise it will fail the test cases. 40 | e. Make sure your code passes all potential test cases, including edge cases 41 | f. Follow the input/output format specified in the problem statement and the sample test cases. 42 | 43 | 44 | **Formatting Instructions: Your response must follow the following xml format** - 45 | 46 | 47 | 48 | [Extract core question, only the most comprehensive and detailed one!] 49 | 50 | 51 | [Extract problem-solving information related to the core question, only the most comprehensive and detailed one!] 52 | 53 | 54 | [Algorithm to solve the problem. Describe the algorithm used to solve the problem such that a novice programmer without any prior knowledge of the solution can implement it. Do not generate code.] 55 | 56 | 57 | [Write a useful tutorial about the above mentioned algorithm(s). Provide a high level generic tutorial for solving these types of problems. Do not generate code.] 58 | 59 | 60 | [Generate a step by step plan to solve the problem.] 61 | 62 | 63 | [Generate a pseudocode to solve the problem.] 64 | 65 | 66 | [Write the final solution in Python3 programming language to solve the problem.] 
67 | 68 | 69 | 70 | --- 71 | """ 72 | 73 | 74 | @weave.op 75 | async def draft_solution( 76 | problem: Problem, model: str = FAST_LLM, temperature: float = 0.0 77 | ) -> Solution: 78 | user_prompt = f"""{problem.as_xml} 79 | --- 80 | Let's think step by step to solve the problem: 81 | """ 82 | 83 | response = await async_client.chat.completions.create( 84 | model=model, 85 | messages=[ 86 | {"role": "system", "content": SOLVER_INSTRUCTIONS}, 87 | {"role": "user", "content": user_prompt}, 88 | ], 89 | response_model=None, 90 | temperature=temperature, 91 | max_tokens=MAX_TOKENS, 92 | max_retries=2 93 | ) 94 | try: 95 | formatted_response = await format_response( 96 | text=response.choices[0].message.content, 97 | model=Solution, 98 | temperature=temperature 99 | ) 100 | assert isinstance(formatted_response, Solution), "Response must be a Solution object" 101 | return formatted_response 102 | except Exception as e: 103 | err_msg = f"Error formatting response: {e}" 104 | logger.error(err_msg) 105 | raise_in_weave(raise_error=True, msg=err_msg) 106 | return Solution( 107 | core_question=err_msg, 108 | problem_solving_info=[err_msg], 109 | algorithm=err_msg, 110 | tutorial=err_msg, 111 | plan=err_msg, 112 | pseudocode=err_msg, 113 | source_code=err_msg, 114 | ) 115 | 116 | 117 | ANALYSIS_INSTRUCTIONS = """You are an expert programming analyst with a deep understanding of competitive programming. 118 | You are provided with a problem statement and a solution to a problem. 119 | Your task is to develop a step by step plan and pseudocode to solve the problem. 120 | You will do this in a step by step manner. 121 | First, extract the core question and the problem-solving information from the problem statement. 122 | Then, describe the algorithm used to solve the problem. 123 | Then, write a short tutorial on the algorithm and how it works. 124 | Next, generate a step by step plan to solve the problem. 125 | Finally, generate the pseudocode to solve the problem. 
@weave.op
async def analyze_and_plan(example: dict, temperature: float = 0.7) -> Analysis:
    """Ask the fast LLM to analyse one retrieved example and plan a solution.

    Returns a placeholder Analysis filled with the error message when the
    completion cannot be parsed into an Analysis object.
    """
    prompt = f"""{format_example(example)}

Let's think step by step to analyze the problem and plan a solution to the problem:
"""
    completion = await async_client.chat.completions.create(
        model=FAST_LLM,
        messages=[
            {"role": "system", "content": ANALYSIS_INSTRUCTIONS},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        response_model=None,
        max_tokens=MAX_TOKENS,
        max_retries=2
    )

    try:
        return await format_response(
            text=completion.choices[0].message.content,
            model=Analysis,
            temperature=temperature
        )
    except Exception as e:
        err_msg = f"Error formatting response: {e}"
        logger.error(err_msg)
        raise_in_weave(raise_error=True, msg=err_msg)
        return Analysis(
            core_question=err_msg,
            problem_solving_info=[err_msg],
            algorithm=err_msg,
            tutorial=err_msg,
            plan=err_msg,
            pseudocode=err_msg,
        )


@weave.op
async def analyze_and_plan_solutions(docs: List[dict], temperature: float = 0.7) -> List[Analysis]:
    """Create an Analysis for every retrieved solution document, concurrently."""
    pending = [analyze_and_plan(doc, temperature) for doc in docs]
    return await asyncio.gather(*pending)


@weave.op
async def generate_solution(
    problem: Problem, examples: str, model: str = STRONG_LLM, temperature: float = 0.0
) -> Solution:
    """Solve `problem` with few-shot examplars embedded in the system prompt.

    `examples` is the pre-formatted examplar text produced from retrieved,
    previously solved problems. Falls back to a placeholder Solution carrying
    the error message when the completion cannot be parsed.
    """
    system_prompt = f"""{SOLVER_INSTRUCTIONS}

You have previously solved the following problems in this competition:

{examples}

"""

    completion = await async_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"""{problem.as_xml}
---
Let's think step by step to solve the problem:

""",
            },
        ],
        response_model=None,
        temperature=temperature,
        max_tokens=MAX_TOKENS,
        max_retries=2
    )
    try:
        return await format_response(
            text=completion.choices[0].message.content,
            model=Solution,
            temperature=temperature
        )
    except Exception as e:
        err_msg = f"Error formatting response: {e}"
        logger.error(err_msg)
        raise_in_weave(raise_error=True, msg=err_msg)
        return Solution(
            core_question=err_msg,
            problem_solving_info=[err_msg],
            algorithm=err_msg,
            tutorial=err_msg,
            plan=err_msg,
            pseudocode=err_msg,
            source_code=err_msg,
        )
257 | Your expertise is in algorithms and data structures. 258 | You have incorrectly answered the following programming problem. 259 | Your task is to reflect on the problem, your solution, and the correct answer. 260 | You will then use this information help you answer the same question in the future. 261 | First, explain why you answered the question incorrectly. 262 | Second, list the keywords that describe the type of your errors from most general to most specific. 263 | Third, solve the problem again, step-by-step, based on your knowledge of the correct answer. 264 | Fourth, create a list of detailed instructions to help you correctly solve this problem in the future. 265 | Finally, create a list of general advice to help you solve similar types of problems in the future. 266 | Be concise in your response; however, capture all of the essential information. 267 | 268 | {problem} 269 | 270 | {incorrect_solution} 271 | 272 | 273 | {test_report} 274 | 275 | 276 | **Format Instructions: Your response must follow the following xml format** - 277 | 278 | 279 | 280 | [Reflect on the problem, your solution, and the correct answer.] 281 | 282 | 283 | [List the keywords that describe the type of your errors from most general to most specific.] 284 | 285 | 286 | [Solve the problem again, step-by-step, based on your knowledge of the correct answer.] 287 | 288 | 289 | [Create a list of detailed instructions to help you correctly solve this problem in the future.] 290 | 291 | 292 | [Create a list of general advice to help you solve similar types of problems in the future.] 
@weave.op
async def reflection(
    problem: Problem,
    incorrect_solution: Solution,
    test_report: str,
    model: str = STRONG_LLM,
    temperature: float = 0.0,
) -> Reflection:
    """Ask the LLM to reflect on a failed solution and produce guidance.

    Uses REFLECTION_INSTRUCTIONS (defined above) as the system prompt, with
    the problem, the incorrect solution, and the test report interpolated.
    Returns a placeholder Reflection filled with the error message when the
    completion cannot be parsed.
    """
    system_prompt = REFLECTION_INSTRUCTIONS.format(
        problem=problem.as_xml,
        incorrect_solution=incorrect_solution.as_xml,
        test_report=test_report,
    )
    messages = [{"role": "system", "content": system_prompt}]
    response = await async_client.chat.completions.create(
        model=model,
        messages=messages,
        response_model=None,
        temperature=temperature,
        max_tokens=MAX_TOKENS,
        max_retries=2
    )
    try:
        formatted_response = await format_response(
            text=response.choices[0].message.content,
            model=Reflection,
            temperature=temperature
        )
        return formatted_response
    except Exception as e:
        err_msg = f"Error formatting response: {e}"
        logger.error(err_msg)
        raise_in_weave(raise_error=True, msg=err_msg)
        return Reflection(
            reflection=err_msg,
            keywords=err_msg,
            step_by_step_solution=err_msg,
            instructions=err_msg,
            general_advice=err_msg,
        )


@weave.op
def raise_in_weave(raise_error: bool = False, msg: str = ""):
    """
    Show an error message in Weave while continuing execution.

    NOTE(review): the exception is raised and swallowed inside this same op,
    so the op itself completes successfully — confirm this actually surfaces
    as an error in the Weave UI rather than a normal call.
    """
    try:
        if raise_error:
            raise Exception(msg)
    except Exception:
        pass


@weave.op
async def improve_solution(
    problem: Problem,
    incorrect_solution: Solution,
    test_report: str,
    reflections: Reflection,
    model: str = STRONG_LLM,
    temperature: float = 0.0,
) -> Solution:
    """Generate a corrected Solution using the prior failure and its reflection.

    The conversation replays the failed attempt, its test report, and the
    self-reflection before asking for a corrected answer. Falls back to a
    placeholder Solution carrying the error message on parse failure.
    """
    messages = [
        {"role": "system", "content": SOLVER_INSTRUCTIONS},
        {"role": "user", "content": problem.as_xml},
        {"role": "assistant", "content": incorrect_solution.as_xml},
        {"role": "user", "content": f"\n{test_report}\n"},
        {
            "role": "user",
            "content": "Your previous answer to the question is incorrect. Please reflect on the problem, your solution, and the correct answer.",
        },
        {"role": "assistant", "content": reflections.as_xml},
        {
            "role": "user",
            "content": """Use your self-reflection (above) to help you answer the question correctly.


---
Let's think step by step to solve the problem correctly:
""",
        },
    ]
    response = await async_client.chat.completions.create(
        model=model,
        messages=messages,
        response_model=None,
        temperature=temperature,
        max_tokens=MAX_TOKENS,
        max_retries=2
    )
    try:
        formatted_response = await format_response(
            text=response.choices[0].message.content,
            model=Solution,
            temperature=temperature
        )
        return formatted_response
    except Exception as e:
        err_msg = f"Error formatting response: {e}"
        # Log first, then surface to Weave — matches the order used by the
        # sibling solvers (was inconsistently reversed here).
        logger.error(err_msg)
        raise_in_weave(raise_error=True, msg=err_msg)
        return Solution(
            core_question=err_msg,
            problem_solving_info=[err_msg],
            algorithm=err_msg,
            tutorial=err_msg,
            plan=err_msg,
            pseudocode=err_msg,
            source_code=err_msg,
        )


@weave.op
async def zero_shot_solver(
    problem: Problem, model: str = FAST_LLM, temperature: float = 0.7, timeout: int = 10
) -> dict:
    """Draft a solution with no retrieval and test it against the samples."""
    logger.info("Drafting initial zero-shot solution")  # fixed "intial" typo
    solution = await draft_solution(
        problem=problem,
        model=model,
        temperature=temperature,
    )
    test_report = await check_correctness(
        solution.source_code, problem.sample_input, problem.sample_output, timeout
    )
    logger.info(f"Draft solution result: {repr(test_report)}")
    return {"solution": solution, "stage": "zero-shot", "test_report": test_report}


@weave.op
async def rag_solver(
    retriever: Retriever,
    problem: Problem,
    model: str = FAST_LLM,
    temperature: float = 0.0,
    timeout: int = 10,
) -> dict:
    """Zero-shot attempt first; on failure, retry with retrieved examplars."""
    zero_shot_result = await zero_shot_solver(
        problem=problem,
        model=model,
        temperature=temperature,
        timeout=timeout,
    )
    solution = zero_shot_result["solution"]
    assert isinstance(solution, Solution), "solution must be a Solution object"
    assert solution.source_code, "solution does not contain source_code"

    test_report = zero_shot_result["test_report"]
    # BUG FIX: `test_report` is a report object, not a string; the original
    # `test_report == "passed"` was always False, so the RAG stage ran even
    # when the zero-shot solution already passed. Compare the status field,
    # matching the checks in rag_solver_with_reflection below.
    if test_report.status == "passed":
        return zero_shot_result
    logger.info("Iterating on a RAG solution")

    @weave.op
    async def generate_sample_solutions_from_code_dataset(
        problem: Problem,
        solution: Solution,
        top_k: int = 50,
        top_n: int = 5,
        temperature: float = 0.7
    ):
        """Retrieve similar solved problems, rerank them, and format examplars."""
        logger.info(f"Generating examplars:")
        assert isinstance(problem, Problem), "Problem must be a Problem object"
        assert isinstance(solution, Solution), "Solution must be a Solution object"
        assert solution.source_code, "Solution does not contain source_code"
        retrieve_docs = retriever.retrieve(solution.source_code, top_k)
        reranked_docs = await rerank_docs(problem, solution, retrieve_docs, top_n)
        analyses = await analyze_and_plan_solutions(reranked_docs, temperature)
        examplars = format_examples(reranked_docs, analyses)
        return examplars

    @weave.op
    async def rag_solution(
        problem: Problem,
        draft_solution: Solution,
        model: str = STRONG_LLM,
        temperature: float = 0.7,
        timeout: int = timeout,
    ) -> dict:
        """Generate and test a few-shot solution built from the examplars."""
        logger.info(f"Generating RAG solution:")
        examplars = await generate_sample_solutions_from_code_dataset(
            problem=problem,
            solution=draft_solution,
            temperature=temperature
        )
        rag_solution = await generate_solution(
            problem=problem,
            examples=examplars,
            model=model,
            temperature=temperature,
        )
        test_report = await check_correctness(
            rag_solution.source_code,
            problem.sample_input,
            problem.sample_output,
            timeout,
        )
        logger.info(f"RAG Solution Result: {repr(test_report)}")
        return {"solution": rag_solution, "test_report": test_report}

    rag_result = await rag_solution(problem, solution, model, temperature, timeout)
    solution = rag_result["solution"]
    test_report = rag_result["test_report"]
    return {"solution": solution, "stage": "rag", "test_report": test_report}


@weave.op
async def rework_solution(
    problem: Problem,
    incorrect_solution: Solution,
    test_report: str,
    model: str = STRONG_LLM,
    temperature: float = 0.0,
    timeout: int = 10,
) -> dict:
    """Reflect on a failed solution, improve it, and re-test the result."""
    logger.info(f"Reflecting and improving solution")
    reflections = await reflection(
        problem=problem,
        incorrect_solution=incorrect_solution,
        test_report=test_report,
        model=model,
        temperature=temperature,
    )
    improved_solution = await improve_solution(
        problem=problem,
        incorrect_solution=incorrect_solution,
        test_report=test_report,
        reflections=reflections,
        model=model,
        temperature=temperature,
    )
    test_report = await check_correctness(
        improved_solution.source_code,
        problem.sample_input,
        problem.sample_output,
        timeout,
    )
    logger.info(f"Reworked solution result: {repr(test_report)}")
    return {"solution": improved_solution, "test_report": test_report}


@weave.op
async def rag_solver_with_reflection(
    retriever: Retriever,
    problem: Problem,
    model: str = FAST_LLM,
    temperature: float = 0.7,
    max_iterations: int = 2,
    code_execution_timeout: int = 10,
):
    """Full pipeline: RAG solve, then reflect-and-rework, up to max_iterations.

    Returns a dict with keys "solution", "stage" (rag / reflection / failed),
    and "test_report".
    """
    # Guard against max_iterations < 1 leaving these unbound at the final return.
    solution, test_report = None, None
    num_iterations = 0
    while num_iterations < max_iterations:
        rag_result = await rag_solver(
            retriever=retriever,
            problem=problem,
            timeout=code_execution_timeout,
            model=model,
            temperature=temperature,
        )
        solution, test_report = rag_result["solution"], rag_result["test_report"]
        if test_report.status == "passed":
            logger.info(f"Passing solution generated successfully for problem: {problem.problem_name}")
            return rag_result

        logger.info(f"Solution failed, reworking solution. Problem: {problem.problem_name}")
        rework_result = await rework_solution(
            problem=problem,
            incorrect_solution=solution,
            test_report=test_report,
            model=model,
            temperature=temperature,
            timeout=code_execution_timeout,
        )
        solution, test_report = rework_result["solution"], rework_result["test_report"]
        if test_report.status == "passed":
            logger.info(f"Re-worked solution passed for problem: {problem.problem_name}")
            return {
                "solution": solution,
                "stage": "reflection",
                "test_report": test_report,
            }
        num_iterations += 1
        logger.info(f"Re-worked solution failed, trying iteration {num_iterations}. Problem: {problem.problem_name}")
    # BUG FIX: this log line was missing the f prefix, so the placeholders
    # were emitted literally instead of being interpolated.
    logger.info(f"Failed to generate a solution after {num_iterations} iterations. Problem: {problem.problem_name}")
    return {"solution": solution, "stage": "failed", "test_report": test_report}
Example usage: 9 | python download.py --year 2023 --dataset_folder dataset 10 | """ 11 | year: int = 2023 # year to download 12 | dataset_folder: Path = Path("dataset") # folder to save the dataset 13 | 14 | if __name__ == "__main__": 15 | args = simple_parsing.parse(ScriptArgs) 16 | snapshot_download(repo_id="hackercupai/hackercup", 17 | repo_type="dataset", local_dir=args.dataset_folder, 18 | allow_patterns=[f"{args.year}/*"], 19 | force_download=True) -------------------------------------------------------------------------------- /llamaindex_workflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\"Weights\n", 15 | "\n", 16 | "# W&B Lighting Competition - AI Hacker Cup \n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "[Weights & Biases](https://wandb.ai/site?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) are running a 7-day Lightning Competition focussed on solving practice problems for the [2024 NeurIPS AI Hacker Cup](https://hackercupai.github.io/) challenge.\n", 21 | "\n", 22 | "#### Goal\n", 23 | "The goal is to try and solve all 5 of the 2023 practice questions for the AI Hacker Cup using MistralAI's models. We’re offering free MistralAI api access via the code in this colab to get people started.\n", 24 | "\n", 25 | "#### Competition GitHub\n", 26 | "The competition [repo here](https://github.com/wandb/aihackercup) contains this colab, the code for the Code Generation Agent and the details on how to make a submission and the competition rules. 
Note that to run this notebook you'll need to be running it with a T4 GPU (15GB) or larger as the embedding model is run locally.\n", 27 | "\n", 28 | "#### Discord\n", 29 | "You can join the official NeurIPS AI Hacker Cup [discord here](discord.gg/wWeN9hTH32) to share ideas and discuss winning solutions.\n", 30 | "\n", 31 | "## Prizes\n", 32 | "\n", 33 | "Weights & Biases are giving away a pair of Meta Ray-Ban Smart Glasses for the first individual to submit code that solves:\n", 34 | "- 3 out of 5 correct solutions\n", 35 | "- 4 out of 5 correct solutions\n", 36 | "- 5 out of 5 correct solutions\n", 37 | "\n", 38 | "(i.e. in total 3 pairs of sunglasses to give away)\n", 39 | "\n", 40 | "## Entry Submissions, Rules & Deadline\n", 41 | "\n", 42 | "See the [competition README](https://github.com/wandb/aihackercup) for how to make a submissions the the competition rules." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## W&B Weave\n", 50 | "\n", 51 | "[W&B Weave](https://weave-docs.wandb.ai/tutorial-eval?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) is used in this competition to run the evaluations. It is a lightweight toolkit for tracking and evaluating LLM applications, built by Weights & Biases. \n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "If you want to learn more about Weave, you can [get started](https://weave-docs.wandb.ai/quickstart?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) by decorating Python functions with `@weave.op`." 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# LlamaIndex Workflow with retries\n", 63 | "\n", 64 | "A simple workflow to solve the problem using LlamaIndex. It is a good example on how to format your codebase to use llamaIndex!" 
65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Setup " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "**Note: You need to run this cell only once**\n", 79 | "We will clone the starter-kits repo\n", 80 | "Set the rag folder as our working directory\n", 81 | "and install the dependencies for the project.\n", 82 | "\n", 83 | "**You can comment out the cell after you have run it once.**" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Clone the starter-kits repo\n", 93 | "!git clone https://github.com/wandb/aihackercup\n", 94 | "# Change directory to the rag folder. Running the next line twice in the same session will raise an error.\n", 95 | "%cd aihackercup\n", 96 | "# Install dependencies\n", 97 | "!pip install -r requirements.txt -qq" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "To run this colab, create a [free Weights & Biases (W&B) account here](https://wandb.ai/site?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) and then copy your API key from https://wandb.ai/authorize into the input box below when requested." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "import os\n", 114 | "import weave\n", 115 | "\n", 116 | "WEAVE_PROJECT = \"ai-hacker-cup\"\n", 117 | "weave_client = weave.init(WEAVE_PROJECT)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# Select MistralAI models used depending if you want a fast or strong LLM\n", 127 | "# You can see the full range of MistralAI models here: https://docs.mistral.ai/getting-started/models/\n", 128 | "FAST_LLM = \"open-mistral-nemo-2407\"\n", 129 | "STRONG_LLM = \"mistral-large-latest\"\n", 130 | "\n", 131 | "os.environ[\"FAST_LLM\"] = STRONG_LLM # We'll use stong model everywhere\n", 132 | "os.environ[\"STRONG_LLM\"] = STRONG_LLM\n", 133 | "\n", 134 | "# URL for the MistralAI api we'll be using\n", 135 | "os.environ[\"BASE_URL\"] = \"http://195.242.25.198:8000/v1\"\n", 136 | "os.environ[\"API_KEY\"] = \"dummy_key\"\n", 137 | "\n", 138 | "# Set the max tokens for the models and how many parallel requests to make in Weave Evaluations\n", 139 | "os.environ[\"MAX_TOKENS\"] = \"4096\"\n", 140 | "os.environ[\"WEAVE_PARALLELISM\"] = \"2\"" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "import asyncio\n", 150 | "import logging\n", 151 | "\n", 152 | "# Start of workout\n", 153 | "from utils import Problem, async_client, STRONG_LLM, format_response, check_correctness" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## Challenges Dataset\n", 161 | "We will use the **practice** dataset from the **2023** [HackerCup dataset](https://huggingface.co/datasets/hackercupai/hackercup).\n", 162 | "\n", 163 | "We have already processed the dataset and saved it as a 
[`weave.Dataset`](https://weave-docs.wandb.ai/guides/core-types/datasets/?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup). You can either use the Dataset by running the next cell or download the dataset using the instructions below.\n", 164 | "\n", 165 | "We will use this challenge dataset to load some practice problems and solutions from the HackerCup dataset and evaluate our agents on it." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# get dataset\n", 175 | "practice_dataset_uri = \"weave:///parambharat/hackercup/object/practice_dataset:R35fXf9N3FE2IOesg7bRPaPAxiE9YbpirhXO9HcHs8w\"\n", 176 | "problems_dataset = weave.ref(practice_dataset_uri).get().rows[:]\n", 177 | "problems = list(map(lambda x: Problem(**x), problems_dataset))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "Let's define what we expect as a solution:" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from pydantic import BaseModel, Field\n", 194 | "\n", 195 | "class Solution(BaseModel):\n", 196 | " core_question: str = Field(..., description=\"Core question of the problem\")\n", 197 | " problem_solving_info: str = Field(..., description=\"Problem-solving information related to the core question\")\n", 198 | " plan: str = Field(..., description=\"Step by step plan to solve the problem\")\n", 199 | " pseudocode: str = Field(..., description=\"Pseudocode to solve the problem\")\n", 200 | " source_code: str = Field(..., description=\"Valid Python3 sourcecode to solve the problem.\")" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## One Shot Solver with retires\n", 208 | "\n", 209 | "We will use a [llamaIndex 
workflow](https://docs.llamaindex.ai/en/stable/understanding/workflows/) based approach to solve the problem.\n", 210 | "\n", 211 | "> A workflow is an event-driven, step-based way to control the execution flow of an application.\n", 212 | "\n", 213 | "This structure is very flexible and works perfectly with `weave.op` decorators that we will use to define our steps.\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "system_prompt = \"\"\"\n", 223 | "You are a world-class competitive programmer tasked with solving a programming problem. \n", 224 | "You will be provided with a problem statement, and you need to create a Python3 solution for it. \n", 225 | "Your task it to develop a winning solution to the problem in Python3 programming language.\n", 226 | "You will do this in a step-by-step manner.\n", 227 | "\n", 228 | "Step 1: Extract the core question and the problem-solving information from the problem statement.\n", 229 | "Step 2: Generate a step by step plan to solve the problem.\n", 230 | "Step 3: Generate the pseudocode to solve the problem.\n", 231 | "Step 4: Write the final solution in Python3 programming language to solve the problem.\n", 232 | "\n", 233 | "Competition Guidelines:\n", 234 | " a. Do not use any external libraries; stick to Python 3 standard library\n", 235 | " b. Handle input and output using standard input/output (stdin/stdout)\n", 236 | " c. Use helper functions to improve readability of the code.\n", 237 | " c. Use the `input()` function to take input from stdin and print the output to stdout.\n", 238 | " d. Do not add extra print statements otherwise it will fail the test cases.\n", 239 | " e. Make sure your code passes all potential test cases, including edge cases\n", 240 | " f. 
Follow the input/output format specified in the problem statement and the sample test cases.\"\"\"\n", 241 | "\n", 242 | "prompt_template = \"\"\"\n", 243 | "Let's think step by step to solve the problem:\n", 244 | "\n", 245 | "Problem: \n", 246 | "{problem_description}\n", 247 | "\n", 248 | "Input: \n", 249 | "{sample_input}\n", 250 | "\n", 251 | "Output: \n", 252 | "{sample_output}\n", 253 | "\"\"\"" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "Let's define the events that our workflow will emit.\n", 261 | "> The events attributes are the outputs of the steps in our workflow." 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "from llama_index.core.workflow import (\n", 271 | " StartEvent,\n", 272 | " StopEvent,\n", 273 | " Workflow,\n", 274 | " step,\n", 275 | " Event,\n", 276 | " Context,\n", 277 | ")\n", 278 | "\n", 279 | "class SetupEvent(Event):\n", 280 | " problem: Problem\n", 281 | " test_report: str = None\n", 282 | "\n", 283 | "class SolvedProblemEvent(Event):\n", 284 | " problem: Problem\n", 285 | " problem_solution: str\n", 286 | "\n", 287 | "class FormattedSolutionEvent(Event):\n", 288 | " problem: Problem\n", 289 | " solution: Solution\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Then, we can define our workflow.\n", 297 | "\n", 298 | "Each step in the workflow is a function decorated with `@weave.op` that takes the current context and the event as input and returns the next event.\n", 299 | "\n", 300 | "The `@step` decorator is used to mark the function as a step in the workflow." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "class OneShotSolverWorkflow(Workflow):\n", 310 | "\n", 311 | " def __init__(self, retries: int = 2, temperature: float = 0.7, code_execution_timeout: int = 30, **kwargs): \n", 312 | " super().__init__(**kwargs)\n", 313 | " self.retries = retries\n", 314 | " self.temperature = temperature\n", 315 | " self.code_execution_timeout = code_execution_timeout\n", 316 | "\n", 317 | " @step\n", 318 | " @weave.op\n", 319 | " async def setup(self, ctx: Context, ev: StartEvent) -> SetupEvent:\n", 320 | " problem = ev.problem\n", 321 | " logging.info(f\"Solving problem: {problem.problem_name}\")\n", 322 | " messages=[\n", 323 | " {\"role\": \"system\", \"content\": ev.system_prompt},\n", 324 | " {\"role\": \"user\", \"content\": ev.prompt_template.format(\n", 325 | " problem_description=problem.problem_description,\n", 326 | " sample_input=problem.sample_input,\n", 327 | " sample_output=problem.sample_output)}\n", 328 | " ]\n", 329 | " await ctx.set(\"messages\", messages)\n", 330 | " return SetupEvent(problem=problem)\n", 331 | " \n", 332 | " @step\n", 333 | " @weave.op\n", 334 | " async def generate_code(self, ctx: Context, ev: SetupEvent) -> SolvedProblemEvent:\n", 335 | " messages = await ctx.get(\"messages\")\n", 336 | " if ev.test_report:\n", 337 | " messages.append({\"role\": \"user\", \"content\": f\"Let's try again. 
The previous solution was incorrect:\\n {ev.test_report}\"})\n", 338 | " logging.info(\"Calling model to solve the problem\")\n", 339 | " model_output = await async_client.chat.completions.create(\n", 340 | " model=STRONG_LLM,\n", 341 | " messages=messages,\n", 342 | " temperature=self.temperature,\n", 343 | " response_model=None\n", 344 | " )\n", 345 | " problem_solution = model_output.choices[0].message.content\n", 346 | " messages.append({\"role\": \"assistant\", \"content\": problem_solution})\n", 347 | " await ctx.set(\"messages\", messages)\n", 348 | " return SolvedProblemEvent(problem=ev.problem, problem_solution=problem_solution)\n", 349 | "\n", 350 | " @step\n", 351 | " @weave.op\n", 352 | " async def format_solution(self, ev: SolvedProblemEvent) -> FormattedSolutionEvent:\n", 353 | " logging.info(\"Formatting the response\")\n", 354 | " solution = await format_response(ev.problem_solution, Solution)\n", 355 | " return FormattedSolutionEvent(problem=ev.problem, solution=solution)\n", 356 | "\n", 357 | " @step\n", 358 | " @weave.op\n", 359 | " async def check_solution(self, ev: FormattedSolutionEvent) -> StopEvent:\n", 360 | " logging.info(\"Checking if the code is correct\")\n", 361 | " test_report = await check_correctness(\n", 362 | " ev.solution.source_code,\n", 363 | " ev.problem.sample_input,\n", 364 | " ev.problem.sample_output,\n", 365 | " timeout=self.code_execution_timeout,\n", 366 | " )\n", 367 | " logging.info(f\"Test report: {test_report}\")\n", 368 | " if (test_report.status != \"passed\") and self.retries > 0:\n", 369 | " logging.info(f\"Retrying the solution. 
Retries left: {self.retries}\")\n", 370 | " self.retries -= 1\n", 371 | " return SetupEvent(problem=ev.problem, test_report=test_report.message)\n", 372 | " else:\n", 373 | " return StopEvent(result={\"solution\": ev.solution, \"test_report\": test_report})\n" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "# Evaluation" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Now we are ready to evaluate against the expected solutions.\n", 388 | "\n", 389 | "### Create a Weave Model\n", 390 | "First we create a Weave [\"Model\"](https://weave-docs.wandb.ai/guides/core-types/models?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup), which has a `predict` function that Weave Evaluations will call to generate a solution. It also has various attributes that we can set to adjust the behaviour of our pipeline." 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "\n", 400 | "class OneShotSolverWithRetries(weave.Model):\n", 401 | " llm_model: str = STRONG_LLM\n", 402 | " system_prompt: str = system_prompt\n", 403 | " prompt_template: str = prompt_template\n", 404 | " temperature: float = 0.7\n", 405 | " code_execution_timeout: int = 30\n", 406 | " retries: int = 2\n", 407 | "\n", 408 | " @weave.op\n", 409 | " async def predict(self, problem: dict):\n", 410 | " workflow = OneShotSolverWorkflow(\n", 411 | " temperature=self.temperature,\n", 412 | " retries=self.retries,\n", 413 | " code_execution_timeout=self.code_execution_timeout,\n", 414 | " timeout=600, # timeout for the entire workflow\n", 415 | " \n", 416 | " )\n", 417 | " problem_obj = Problem(**problem)\n", 418 | " \n", 419 | " result = await workflow.run(\n", 420 | " problem=problem_obj,\n", 421 | " system_prompt=self.system_prompt,\n", 422 | " prompt_template=self.prompt_template,\n", 423 | " )\n", 
424 | " \n", 425 | " return result\n", 426 | "\n", 427 | "model = OneShotSolverWithRetries()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "### Create the Evals Dataset and a Scorer" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "We expect the output of the \"test_report\" from our agent above to be `\"passed\"` if the solution is correct. You can think of `expected_result` in the `evals_dataset` as the label that the `test_report` from our solver needs to return in order to ensure the generated solution is correct. In this case the scoring is actually happening in our agentic pipeline as the agent needs to know the result so it can decide whether or not to retry.\n", 442 | "\n", 443 | "Weave Evaluations expects data formatted as a list of dictionaries for the evaluation dataset. We dump `problem` as a dictionary." 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "evals_dataset = [{\"problem\": problem.model_dump(), \"expected_result\": \"passed\"} for problem in problems]" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Weave Evaluations use a scorer function that returns a metric and its result in a dict. 
Here we define a metric that checks if the code generated by agent passed the test case" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "@weave.op\n", 469 | "def scorer(expected_result: str, model_output: dict) -> dict:\n", 470 | " if model_output is None or model_output[\"test_report\"].status is None:\n", 471 | " return {\"solution_passed\": False}\n", 472 | " return {\"solution_passed\": expected_result == model_output[\"test_report\"].status} # check if the test_report status == passed" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "evaluator = weave.Evaluation(dataset=evals_dataset, scorers=[scorer], trials=1)\n", 482 | "\n", 483 | "results = await evaluator.evaluate(model)" 484 | ] 485 | } 486 | ], 487 | "metadata": { 488 | "accelerator": "GPU", 489 | "colab": { 490 | "include_colab_link": true, 491 | "provenance": [], 492 | "toc_visible": true 493 | }, 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "name": "python3" 497 | } 498 | }, 499 | "nbformat": 4, 500 | "nbformat_minor": 2 501 | } 502 | -------------------------------------------------------------------------------- /llamaindex_workflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import weave 3 | import asyncio 4 | import logging 5 | from pydantic import BaseModel, Field 6 | from llama_index.core.workflow import ( 7 | StartEvent, 8 | StopEvent, 9 | Workflow, 10 | step, 11 | Event, 12 | Context, 13 | ) 14 | 15 | # Select MistralAI models used depending if you want a fast or strong LLM 16 | # You can see the full range of MistralAI models here: https://docs.mistral.ai/getting-started/models/ 17 | FAST_LLM = "open-mistral-nemo-2407" 18 | STRONG_LLM = "mistral-large-latest" 19 | 20 | os.environ["FAST_LLM"] = STRONG_LLM # We'll use 
stong model everywhere 21 | os.environ["STRONG_LLM"] = STRONG_LLM 22 | 23 | # URL for the MistralAI api we'll be using 24 | os.environ["BASE_URL"] = "http://195.242.25.198:8000/v1" 25 | os.environ["API_KEY"] = "dummy_key" 26 | 27 | # Set the max tokens for the models and how many parallel requests to make in Weave Evaluations 28 | os.environ["MAX_TOKENS"] = "4096" 29 | os.environ["WEAVE_PARALLELISM"] = "2" 30 | 31 | weave.init("llamaindex-workflow") 32 | 33 | from utils import Problem, async_client, STRONG_LLM, format_response, check_correctness 34 | 35 | logging.basicConfig( 36 | format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO 37 | ) 38 | logger = logging.getLogger(__name__) 39 | 40 | # get dataset 41 | practice_dataset_uri = "weave:///parambharat/hackercup/object/practice_dataset:R35fXf9N3FE2IOesg7bRPaPAxiE9YbpirhXO9HcHs8w" 42 | problems_dataset = weave.ref(practice_dataset_uri).get().rows[:] 43 | problems = list(map(lambda x: Problem(**x), problems_dataset)) 44 | 45 | class Solution(BaseModel): 46 | core_question: str = Field(..., description="Core question of the problem") 47 | problem_solving_info: str = Field(..., description="Problem-solving information related to the core question") 48 | plan: str = Field(..., description="Step by step plan to solve the problem") 49 | pseudocode: str = Field(..., description="Pseudocode to solve the problem") 50 | source_code: str = Field(..., description="Valid Python3 sourcecode to solve the problem.") 51 | 52 | system_prompt = """ 53 | You are a world-class competitive programmer tasked with solving a programming problem. 54 | You will be provided with a problem statement, and you need to create a Python3 solution for it. 55 | Your task it to develop a winning solution to the problem in Python3 programming language. 56 | You will do this in a step-by-step manner. 57 | 58 | Step 1: Extract the core question and the problem-solving information from the problem statement. 
59 | Step 2: Generate a step by step plan to solve the problem. 60 | Step 3: Generate the pseudocode to solve the problem. 61 | Step 4: Write the final solution in Python3 programming language to solve the problem. 62 | 63 | Competition Guidelines: 64 | a. Do not use any external libraries; stick to Python 3 standard library 65 | b. Handle input and output using standard input/output (stdin/stdout) 66 | c. Use helper functions to improve readability of the code. 67 | c. Use the `input()` function to take input from stdin and print the output to stdout. 68 | d. Do not add extra print statements otherwise it will fail the test cases. 69 | e. Make sure your code passes all potential test cases, including edge cases 70 | f. Follow the input/output format specified in the problem statement and the sample test cases.""" 71 | 72 | prompt_template = """ 73 | Let's think step by step to solve the problem: 74 | 75 | Problem: 76 | {problem_description} 77 | 78 | Input: 79 | {sample_input} 80 | 81 | Output: 82 | {sample_output} 83 | """ 84 | class SetupEvent(Event): 85 | problem: Problem 86 | test_report: str = None 87 | 88 | class SolvedProblemEvent(Event): 89 | problem: Problem 90 | problem_solution: str 91 | 92 | class FormattedSolutionEvent(Event): 93 | problem: Problem 94 | solution: Solution 95 | 96 | class OneShotSolverWorkflow(Workflow): 97 | retries: int = 2 98 | temperature: float = 0.7 99 | code_execution_timeout: int = 30 100 | 101 | @step 102 | @weave.op 103 | async def setup(self, ctx: Context, ev: StartEvent) -> SetupEvent: 104 | problem = ev.problem 105 | logging.info(f"Solving problem: {problem.problem_name}") 106 | messages=[ 107 | {"role": "system", "content": ev.system_prompt}, 108 | {"role": "user", "content": ev.prompt_template.format( 109 | problem_description=problem.problem_description, 110 | sample_input=problem.sample_input, 111 | sample_output=problem.sample_output)} 112 | ] 113 | await ctx.set("messages", messages) 114 | return 
SetupEvent(problem=problem) 115 | 116 | @step 117 | @weave.op 118 | async def generate_code(self, ctx: Context, ev: SetupEvent) -> SolvedProblemEvent: 119 | messages = await ctx.get("messages") 120 | if ev.test_report: 121 | messages.append({"role": "user", "content": f"Let's try again. The previous solution was incorrect:\n {ev.test_report}"}) 122 | logging.info("Calling model to solve the problem") 123 | model_output = await async_client.chat.completions.create( 124 | model=STRONG_LLM, 125 | messages=messages, 126 | temperature=self.temperature, 127 | response_model=None 128 | ) 129 | problem_solution = model_output.choices[0].message.content 130 | messages.append({"role": "assistant", "content": problem_solution}) 131 | await ctx.set("messages", messages) 132 | return SolvedProblemEvent(problem=ev.problem, problem_solution=problem_solution) 133 | 134 | @step 135 | @weave.op 136 | async def format_solution(self, ev: SolvedProblemEvent) -> FormattedSolutionEvent: 137 | logging.info("Formatting the response") 138 | solution = await format_response(ev.problem_solution, Solution) 139 | return FormattedSolutionEvent(problem=ev.problem, solution=solution) 140 | 141 | @step 142 | @weave.op 143 | async def check_solution(self, ev: FormattedSolutionEvent) -> StopEvent: 144 | logging.info("Checking if the code is correct") 145 | test_report = await check_correctness( 146 | ev.solution.source_code, 147 | ev.problem.sample_input, 148 | ev.problem.sample_output, 149 | timeout=self.code_execution_timeout, 150 | ) 151 | logging.info(f"Test report: {test_report}") 152 | if (test_report.status != "passed") and self.retries > 0: 153 | logging.info(f"Retrying the solution. 
Retries left: {self.retries}") 154 | self.retries -= 1 155 | return SetupEvent(problem=ev.problem, test_report=test_report.message) 156 | else: 157 | return StopEvent(result={"solution": ev.solution, "test_report": test_report}) 158 | 159 | 160 | class OneShotSolver(weave.Model): 161 | code_execution_timeout: int = 30 162 | llm_model: str = STRONG_LLM 163 | system_prompt: str = system_prompt 164 | prompt_template: str = prompt_template 165 | temperature: float = 0.7 166 | 167 | @weave.op 168 | async def predict(self, problem: dict): 169 | workflow = OneShotSolverWorkflow(timeout=600, verbose=True) 170 | problem_obj = Problem(**problem) 171 | 172 | result = await workflow.run( 173 | problem=problem_obj, 174 | system_prompt=self.system_prompt, 175 | prompt_template=self.prompt_template, 176 | temperature=self.temperature, 177 | code_execution_timeout=self.code_execution_timeout 178 | ) 179 | 180 | return result 181 | 182 | model = OneShotSolver() 183 | 184 | evals_dataset = [{"problem": problem.model_dump(), "expected_result": "passed"} for problem in problems] 185 | 186 | # #run one example 187 | # result = asyncio.run(model.predict(evals_dataset[0]["problem"])) 188 | # print(result) 189 | 190 | @weave.op 191 | def scorer(expected_result: str, model_output: dict) -> dict: 192 | if model_output is None or model_output["test_report"].status is None: 193 | return {"solution_passed": False} 194 | return {"solution_passed": expected_result == model_output["test_report"].status} 195 | 196 | logger.info("Creating evaluator") 197 | evaluator = weave.Evaluation(dataset=evals_dataset, scorers=[scorer], trials=1) 198 | 199 | logger.info(f"Evaluating model: {model}") 200 | results = asyncio.run(evaluator.evaluate(model)) 201 | logger.info(f"Evaluation results: {results}") 202 | -------------------------------------------------------------------------------- /mistral.py: -------------------------------------------------------------------------------- 1 | from mistralai.client 
import MistralClient 2 | 3 | client = MistralClient() 4 | 5 | 6 | prompt = "hola"*25 7 | prompt = """\n\n\n"An apple a day keeps the doctor away" is Steve's motto. His other motto, "You can never have too much of a good thing," holds true for both apples and mottos. Steve would like to eat two apples per day for the next \\(N\\) days, but with strict adherence to his third motto "Consistency is key." Specifically, he'd like the sum of the two apple weights he eats over the next \\(N\\) days to be the same for each day.\nSteve has already purchased \\(2*N-1\\) apples, the \\(i\\)th of which weighs \\(A_i\\) ounces. He\'d like to buy one more apple that\'s as light as possible to fulfill his goal. Steve can buy an apple of any positive integer weight in ounces from the store. Is it possible for him to reach his goal, and if so, what weight apple should he buy?\n{{PHOTO_ID:1563872647765708|WIDTH:600}}\n*The above image depicts the solution to the first sample. Each day, Steve will eat two apples totalling \\(7\\) oz. Steve must buy a \\(4\\) oz apple to make this happen.*\n# Constraints\n\\(1 \\leq T \\leq 70\\)\n\\(1 \\leq N \\leq 3*10^5\\)\nThe sum of \\(N\\) over all cases is at most \\(600{,}000\\)\n\\(1 \\leq A_i \\leq 10^9\\)\n# Input Format\nInput begins with an integer \\(T\\), the number of test cases. Each test case starts with a single integer \\(N\\). The next line contains \\(2*N-1\\) space-separated integers \\(A_1, ..., A_{2*N - 1}\\).\n# Output Format\nFor the \\(i\\)th test case, print "`Case #i:` " followed a single integer, the smallest possible apple weight in ounces that Steve can buy so that he can eat two apples for the next \\(N\\) days and have the sum of apple weights be the same every day, or \\(-1\\) if doing so is impossible.\n# Sample Explanation\nIn the first case, if Steve buys a \\(4\\) oz apple, he can group his apples as shown above. 
For this input, there\'s no way to succeed by buying any apple below \\(4\\) oz.\nIn the second case, Steve can buy a \\(7\\) oz apple, and eat two apples totaling \\(14\\) oz each day.\nIn the third case, any apple weight will suffice, so Steve will buy the lightest one possible.\nIn the fourth case, no matter what weight apple Steve attempts to buy, it is impossible for him to achieve his goal.\nPlease note, as demonstrated in the seventh case, that it\'s possible for the answer to exceed \\(10^9\\).\n\n\n\n\n7\n3\n6 3 1 2 5\n2\n7 7 7\n1\n1\n3\n1 9 1 1 4\n4\n1 9 1 1 4 9 9\n4\n1 9 10 1 4 6 9\n3\n1000000000 2 10 4 999999994\n\n\n\nCase #1: 4\nCase #2: 7\nCase #3: 1\nCase #4: -1\nCase #5: 6\nCase #6: -1\nCase #7: 1000000002\n\n\n\n\n""" 8 | 9 | chat_response = client.chat( 10 | model="open-mistral-nemo", 11 | messages=[dict(role="system", content="You are a helpful assistant"), 12 | dict(role="user", content=prompt)], 13 | # max_tokens=32 14 | temperature=0.0 15 | ) 16 | 17 | all_content = chat_response.choices[0].message.content 18 | 19 | print(all_content) -------------------------------------------------------------------------------- /one_shot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import weave 3 | import asyncio 4 | import logging 5 | from pydantic import BaseModel, Field 6 | 7 | WEAVE_PROJECT = "ai-hacker-cup" 8 | weave_client = weave.init(WEAVE_PROJECT) 9 | 10 | os.environ["FAST_LLM"] = "gpt-4o" 11 | os.environ["STRONG_LLM"] = "o1-preview" 12 | os.environ["API_KEY"] = os.environ["OPENAI_API_KEY"] 13 | os.environ["MAX_TOKENS"] = "8000" 14 | os.environ["WEAVE_PARALLELISM"] = "5" 15 | 16 | 17 | # Start of workout 18 | from utils import Problem, async_client, STRONG_LLM, format_response, check_correctness 19 | 20 | logging.basicConfig( 21 | format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO 22 | ) 23 | logger = logging.getLogger(__name__) 24 | 25 | # get dataset 26 | 
practice_dataset_uri = "weave:///parambharat/hackercup/object/practice_dataset:R35fXf9N3FE2IOesg7bRPaPAxiE9YbpirhXO9HcHs8w" 27 | problems_dataset = weave.ref(practice_dataset_uri).get().rows[:] 28 | problems = list(map(lambda x: Problem(**x), problems_dataset)) 29 | 30 | 31 | class Solution(BaseModel): 32 | core_question: str = Field(..., description="Core question of the problem") 33 | problem_solving_info: str = Field(..., description="Problem-solving information related to the core question") 34 | plan: str = Field(..., description="Step by step plan to solve the problem") 35 | pseudocode: str = Field(..., description="Pseudocode to solve the problem") 36 | source_code: str = Field(..., description="Valid Python3 sourcecode to solve the problem.") 37 | 38 | 39 | 40 | system_prompt = """ 41 | You are a world-class competitive programmer tasked with solving a programming problem. 42 | You will be provided with a problem statement, and you need to create a Python3 solution for it. 43 | Your task it to develop a winning solution to the problem in Python3 programming language. 44 | You will do this in a step-by-step manner. 45 | 46 | Step 1: Extract the core question and the problem-solving information from the problem statement. 47 | Step 2: Generate a step by step plan to solve the problem. 48 | Step 3: Generate the pseudocode to solve the problem. 49 | Step 4: Write the final solution in Python3 programming language to solve the problem. 50 | 51 | Competition Guidelines: 52 | a. Do not use any external libraries; stick to Python 3 standard library 53 | b. Handle input and output using standard input/output (stdin/stdout) 54 | c. Use helper functions to improve readability of the code. 55 | c. Use the `input()` function to take input from stdin and print the output to stdout. 56 | d. Do not add extra print statements otherwise it will fail the test cases. 57 | e. Make sure your code passes all potential test cases, including edge cases 58 | f. 
Follow the input/output format specified in the problem statement and the sample test cases.""" 59 | 60 | prompt_template = """ 61 | Let's think step by step to solve the problem: 62 | 63 | Problem: 64 | {problem_description} 65 | 66 | Input: 67 | {sample_input} 68 | 69 | Output: 70 | {sample_output} 71 | """ 72 | 73 | @weave.op 74 | async def one_shot_solver( 75 | problem: Problem, 76 | system_prompt: str, 77 | prompt_template: str, 78 | temperature: float = 0.7, 79 | timeout: int = 10 80 | ) -> str: 81 | logging.info(f"Solving problem: {problem.problem_name}") 82 | 83 | @weave.op 84 | def format_prompt(system_prompt: str, prompt_template: str, problem: Problem) -> str: 85 | return system_prompt + prompt_template.format( 86 | problem_description=problem.problem_description, 87 | sample_input=problem.sample_input, 88 | sample_output=problem.sample_output 89 | ) 90 | 91 | # call model one first time to get the code 92 | logging.info("Calling model to solve the problem") 93 | model_output = await async_client.chat.completions.create( 94 | model=STRONG_LLM, 95 | messages=[ 96 | {"role": "user", "content": format_prompt(system_prompt, prompt_template, problem)} 97 | ], 98 | response_model=None 99 | ) 100 | 101 | out = model_output.choices[0].message.content 102 | 103 | # extract code from the response 104 | logging.info("Formatting the response") 105 | solution = await format_response(out, Solution) 106 | 107 | # check if the code is correct 108 | logging.info("Checking if the code is correct") 109 | test_report = await check_correctness( 110 | solution.source_code, 111 | problem.sample_input, 112 | problem.sample_output, 113 | timeout=timeout, 114 | ) 115 | 116 | return {"solution": solution, "test_report": test_report} 117 | 118 | 119 | class OneShotSolver(weave.Model): 120 | code_execution_timeout: int = 30 121 | llm_model: str = STRONG_LLM 122 | system_prompt: str = system_prompt 123 | prompt_template: str = prompt_template 124 | temperature: float = 0.7 125 | 126 
| @weave.op 127 | async def predict(self, problem: dict): 128 | return await one_shot_solver( 129 | problem=Problem(**problem), 130 | system_prompt=self.system_prompt, 131 | prompt_template=self.prompt_template, 132 | timeout=self.code_execution_timeout, 133 | temperature=self.temperature 134 | ) 135 | 136 | model = OneShotSolver() 137 | 138 | 139 | evals_dataset = [{"problem": problem.model_dump(), "expected_result": "passed"} for problem in problems] 140 | 141 | @weave.op 142 | def scorer(expected_result: str, model_output: dict) -> dict: 143 | if model_output is None or model_output["test_report"] is None: 144 | return {"solution_passed": False} 145 | return {"solution_passed": expected_result == model_output["test_report"]} 146 | 147 | logger.info("Creating evaluator") 148 | evaluator = weave.Evaluation(dataset=evals_dataset, scorers=[scorer], trials=1) 149 | 150 | logger.info(f"Evaluating model: {model}") 151 | results = asyncio.run(evaluator.evaluate(model)) 152 | logger.info(f"Evaluation results: {results}") 153 | -------------------------------------------------------------------------------- /one_shot_o1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import weave 3 | import asyncio 4 | import logging 5 | from pathlib import Path 6 | from pydantic import BaseModel, Field 7 | 8 | WEAVE_PROJECT = "ai-hacker-cup" 9 | weave_client = weave.init(WEAVE_PROJECT) 10 | 11 | STRONG_LLM = "o1-preview" 12 | FAST_LLM = "gpt-4o" 13 | os.environ["FAST_LLM"] = FAST_LLM 14 | os.environ["API_KEY"] = os.environ["OPENAI_API_KEY"] 15 | os.environ["WEAVE_PARALLELISM"] = "5" 16 | 17 | 18 | # Start of workout 19 | from utils import async_client, format_response, check_correctness, find_problems, Problem, maybe_remove_backticks 20 | 21 | logging.basicConfig( 22 | format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO 23 | ) 24 | logger = logging.getLogger(__name__) 25 | 26 | # get current dataset! 
27 | ds2024 = Path("./2024/practice") 28 | problems = find_problems(ds2024) 29 | 30 | class Solution(BaseModel): 31 | solution_explanation: str = Field(..., description="Explanation of the solution to the problem") 32 | source_code: str = Field(..., description="Valid Python3 sourcecode to solve the problem.") 33 | 34 | 35 | 36 | prompt_template = """You will be provided with a problem statement, and you need to create a Python3 solution for it. 37 | Write the final solution in Python3 programming language to solve the problem. 38 | 39 | ## Competition Guidelines: 40 | a. Do not use any external libraries; stick to Python 3 standard library 41 | b. Handle input and output using standard input/output (stdin/stdout) 42 | c. Use helper functions to improve readability of the code. 43 | c. Use the `input()` function to take input from stdin and print the output to stdout. 44 | d. Do not add extra print statements otherwise it will fail the test cases. 45 | e. Make sure your code passes all potential test cases, including edge cases 46 | f. Follow the input/output format specified in the problem statement and the sample test cases. 47 | g. We will run the program by calling `python3 program.py` so make sure it outputs the correct results. 
48 | 49 | {problem_description} 50 | 51 | ## Sample Input: 52 | {sample_input} 53 | 54 | ## Expected Output: 55 | {sample_output} 56 | """ 57 | 58 | @weave.op 59 | async def o1_solver( 60 | problem: Problem, 61 | prompt_template: str, 62 | timeout: int = 10 63 | ) -> str: 64 | logging.info(f"Solving problem: {problem.problem_name}") 65 | 66 | @weave.op 67 | def format_prompt(prompt_template: str, problem: Problem) -> str: 68 | return prompt_template.format( 69 | problem_description=problem.problem_description, 70 | sample_input=problem.sample_input, 71 | sample_output=problem.sample_output 72 | ) 73 | 74 | # call model one first time to get the code 75 | logging.info("Calling o1 to solve the problem") 76 | model_output = await async_client.chat.completions.create( 77 | model=STRONG_LLM, 78 | messages=[ 79 | {"role": "user", "content": format_prompt(prompt_template, problem)} 80 | ], 81 | response_model=None 82 | ) 83 | 84 | out = model_output.choices[0].message.content 85 | 86 | # extract code from the response 87 | logging.info("Formatting the response") 88 | solution = await format_response(out, Solution) 89 | solution.source_code = maybe_remove_backticks(solution.source_code) 90 | 91 | # check if the code is correct 92 | logging.info("Checking if the code is correct") 93 | test_report = await check_correctness( 94 | solution.source_code, 95 | problem.sample_input, 96 | problem.sample_output, 97 | timeout=timeout, 98 | ) 99 | logging.info("Checking if the code is correct for the full problem") 100 | input_data = problem.problem_input.read_text() 101 | expected_output = problem.problem_output.read_text() 102 | test_report_full = await check_correctness( 103 | solution.source_code, 104 | input_data, 105 | expected_output, 106 | timeout=timeout, 107 | ) 108 | 109 | return {"solution": solution, 110 | "test_report": test_report, 111 | "test_report_full": test_report_full} 112 | 113 | 114 | class O1ShotSolver(weave.Model): 115 | code_execution_timeout: int = 30 116 | 
llm_model: str = STRONG_LLM 117 | prompt_template: str = prompt_template 118 | 119 | @weave.op 120 | async def predict(self, problem: dict): 121 | return await o1_solver( 122 | problem=Problem(**problem), 123 | prompt_template=self.prompt_template, 124 | timeout=self.code_execution_timeout, 125 | ) 126 | 127 | model = O1ShotSolver() 128 | 129 | evals_dataset = [{"problem": problem.model_dump(), "expected_result": "passed"} for problem in problems] 130 | 131 | @weave.op 132 | def scorer(expected_result: str, model_output: dict) -> dict: 133 | if model_output is None or model_output["test_report"].status is None: 134 | return {"solution_passed": False} 135 | return {"passed_sample": expected_result == model_output["test_report"].status, 136 | "passed_full": expected_result == model_output["test_report_full"].status} 137 | 138 | logger.info("Creating evaluator") 139 | evaluator = weave.Evaluation(dataset=evals_dataset, scorers=[scorer], trials=1) 140 | 141 | logger.info(f"Evaluating model: {model}") 142 | results = asyncio.run(evaluator.evaluate(model)) 143 | logger.info(f"Evaluation results: {results}") 144 | -------------------------------------------------------------------------------- /one_shot_solver.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\"Weights\n", 15 | "\n", 16 | "# W&B Lighting Competition - AI Hacker Cup \n", 17 | "\n", 18 | "\n", 19 | "\n", 20 | "[Weights & Biases](https://wandb.ai/site?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) are running a 7-day Lightning Competition focussed on solving practice problems for the [2024 NeurIPS AI Hacker Cup](https://hackercupai.github.io/) challenge.\n", 21 | "\n", 22 | "#### Goal\n", 23 | "The goal is to try and solve all 5 of the 2023 practice 
questions for the AI Hacker Cup using MistralAI's models. We’re offering free MistralAI api access via the code in this colab to get people started.\n", 24 | "\n", 25 | "#### Competition GitHub\n", 26 | "The competition [repo here](https://github.com/wandb/aihackercup) contains this colab, the code for the Code Generation Agent and the details on how to make a submission and the competition rules. Note that to run this notebook you'll need to be running it with a T4 GPU (15GB) or larger as the embedding model is run locally.\n", 27 | "\n", 28 | "#### Discord\n", 29 | "You can join the official NeurIPS AI Hacker Cup [discord here](discord.gg/wWeN9hTH32) to share ideas and discuss winning solutions.\n", 30 | "\n", 31 | "## Prizes\n", 32 | "\n", 33 | "Weights & Biases are giving away a pair of Meta Ray-Ban Smart Glasses for the first individual to submit code that solves:\n", 34 | "- 3 out of 5 correct solutions\n", 35 | "- 4 out of 5 correct solutions\n", 36 | "- 5 out of 5 correct solutions\n", 37 | "\n", 38 | "(i.e. in total 3 pairs of sunglasses to give away)\n", 39 | "\n", 40 | "## Entry Submissions, Rules & Deadline\n", 41 | "\n", 42 | "See the [competition README](https://github.com/wandb/aihackercup) for how to make a submissions the the competition rules." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## W&B Weave\n", 50 | "\n", 51 | "[W&B Weave](https://weave-docs.wandb.ai/tutorial-eval?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) is used in this competition to run the evaluations. It is a lightweight toolkit for tracking and evaluating LLM applications, built by Weights & Biases. \n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "If you want to learn more about Weave, you can [get started](https://weave-docs.wandb.ai/quickstart?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) by decorating Python functions with `@weave.op`." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# A simple one-shot solver for the AI Hacker Cup 2024 Qualification Round" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Setup " 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "**Note: You need to run this cell only once**\n", 77 | "We will clone the starter-kits repo\n", 78 | "Set the rag folder as our working directory\n", 79 | "and install the dependencies for the project.\n", 80 | "\n", 81 | "**You can comment out the cell after you have run it once.**" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Clone the starter-kits repo\n", 91 | "!git clone https://github.com/wandb/aihackercup\n", 92 | "# Change directory to the rag folder. Running the next line twice in the same session will raise an error.\n", 93 | "%cd aihackercup\n", 94 | "# Install dependencies\n", 95 | "!pip install -r requirements.txt -qq" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "To run this colab, create a [free Weights & Biases (W&B) account here](https://wandb.ai/site?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) and then copy your API key from https://wandb.ai/authorize into the input box below when requested." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "import os\n", 112 | "import weave\n", 113 | "\n", 114 | "WEAVE_PROJECT = \"ai-hacker-cup\"\n", 115 | "weave_client = weave.init(WEAVE_PROJECT)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Select MistralAI models used depending if you want a fast or strong LLM\n", 125 | "# You can see the full range of MistralAI models here: https://docs.mistral.ai/getting-started/models/\n", 126 | "FAST_LLM = \"open-mistral-nemo-2407\"\n", 127 | "STRONG_LLM = \"mistral-large-latest\"\n", 128 | "\n", 129 | "os.environ[\"FAST_LLM\"] = STRONG_LLM # We'll use stong model everywhere\n", 130 | "os.environ[\"STRONG_LLM\"] = STRONG_LLM\n", 131 | "\n", 132 | "# URL for the MistralAI api we'll be using\n", 133 | "os.environ[\"BASE_URL\"] = \"http://195.242.25.198:8000/v1\"\n", 134 | "os.environ[\"API_KEY\"] = \"dummy_key\"\n", 135 | "\n", 136 | "# Set the max tokens for the models and how many parallel requests to make in Weave Evaluations\n", 137 | "os.environ[\"MAX_TOKENS\"] = \"4096\"\n", 138 | "os.environ[\"WEAVE_PARALLELISM\"] = \"2\"" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "import asyncio\n", 148 | "import logging\n", 149 | "\n", 150 | "# Start of workout\n", 151 | "from utils import Problem, async_client, format_response, check_correctness" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Challenges Dataset\n", 159 | "We will use the **practice** dataset from the **2023** [HackerCup dataset](https://huggingface.co/datasets/hackercupai/hackercup).\n", 160 | "\n", 161 | "We have already processed the dataset and saved it as a 
[`weave.Dataset`](https://weave-docs.wandb.ai/guides/core-types/datasets/?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup). You can either use the Dataset by running the next cell or download the dataset using the instructions below.\n", 162 | "\n", 163 | "We will use this challenge dataset to load some practice problems and solutions from the HackerCup dataset and evaluate our agents on it." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# get dataset\n", 173 | "practice_dataset_uri = \"weave:///parambharat/hackercup/object/practice_dataset:R35fXf9N3FE2IOesg7bRPaPAxiE9YbpirhXO9HcHs8w\"\n", 174 | "problems_dataset = weave.ref(practice_dataset_uri).get().rows[:]\n", 175 | "problems = list(map(lambda x: Problem(**x), problems_dataset))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Let's define what we expect as a solution:" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "from pydantic import BaseModel, Field\n", 192 | "\n", 193 | "class Solution(BaseModel):\n", 194 | " core_question: str = Field(..., description=\"Core question of the problem\")\n", 195 | " problem_solving_info: str = Field(..., description=\"Problem-solving information related to the core question\")\n", 196 | " plan: str = Field(..., description=\"Step by step plan to solve the problem\")\n", 197 | " pseudocode: str = Field(..., description=\"Pseudocode to solve the problem\")\n", 198 | " source_code: str = Field(..., description=\"Valid Python3 sourcecode to solve the problem.\")" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## One Shot Solver\n", 206 | "\n", 207 | "Here we define the One Shot Solver pipeline which:\n", 208 | "- takes a problem as input\n", 
209 | "- generates a solution using a large language model\n", 210 | "- executes the generated code\n", 211 | "- checks if the executed code produces the correct output\n", 212 | "- returns the solution and test report\n", 213 | "The solver uses a system prompt and template to guide the LLM in generating\n", 214 | "a step-by-step solution, including core question extraction, problem-solving plan,\n", 215 | "pseudocode, and final Python code.\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "system_prompt = \"\"\"\n", 225 | "You are a world-class competitive programmer tasked with solving a programming problem. \n", 226 | "You will be provided with a problem statement, and you need to create a Python3 solution for it. \n", 227 | "Your task it to develop a winning solution to the problem in Python3 programming language.\n", 228 | "You will do this in a step-by-step manner.\n", 229 | "\n", 230 | "Step 1: Extract the core question and the problem-solving information from the problem statement.\n", 231 | "Step 2: Generate a step by step plan to solve the problem.\n", 232 | "Step 3: Generate the pseudocode to solve the problem.\n", 233 | "Step 4: Write the final solution in Python3 programming language to solve the problem.\n", 234 | "\n", 235 | "Competition Guidelines:\n", 236 | " a. Do not use any external libraries; stick to Python 3 standard library\n", 237 | " b. Handle input and output using standard input/output (stdin/stdout)\n", 238 | " c. Use helper functions to improve readability of the code.\n", 239 | " c. Use the `input()` function to take input from stdin and print the output to stdout.\n", 240 | " d. Do not add extra print statements otherwise it will fail the test cases.\n", 241 | " e. Make sure your code passes all potential test cases, including edge cases\n", 242 | " f. 
Follow the input/output format specified in the problem statement and the sample test cases.\"\"\"\n", 243 | "\n", 244 | "prompt_template = \"\"\"\n", 245 | "Let's think step by step to solve the problem:\n", 246 | "\n", 247 | "Problem: \n", 248 | "{problem_description}\n", 249 | "\n", 250 | "Input: \n", 251 | "{sample_input}\n", 252 | "\n", 253 | "Output: \n", 254 | "{sample_output}\n", 255 | "\"\"\"" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "\n", 265 | "@weave.op\n", 266 | "async def one_shot_solver(\n", 267 | " problem: Problem, \n", 268 | " llm_model: str,\n", 269 | " system_prompt: str, \n", 270 | " prompt_template: str,\n", 271 | " temperature: float = 0.7,\n", 272 | " timeout: int = 10\n", 273 | ") -> str:\n", 274 | " logging.info(f\"Solving problem: {problem.problem_name}\")\n", 275 | "\n", 276 | " # call model one first time to get the code\n", 277 | " logging.info(\"Calling model to solve the problem\")\n", 278 | " model_output = await async_client.chat.completions.create(\n", 279 | " model=llm_model,\n", 280 | " messages=[\n", 281 | " {\"role\": \"system\", \"content\": system_prompt},\n", 282 | " {\"role\": \"user\", \"content\": prompt_template.format(\n", 283 | " problem_description=problem.problem_description,\n", 284 | " sample_input=problem.sample_input,\n", 285 | " sample_output=problem.sample_output)}\n", 286 | " ],\n", 287 | " temperature=temperature,\n", 288 | " response_model=None\n", 289 | " )\n", 290 | "\n", 291 | " out = model_output.choices[0].message.content\n", 292 | "\n", 293 | " # extract code from the response\n", 294 | " logging.info(\"Formatting the response\")\n", 295 | " solution = await format_response(out, Solution)\n", 296 | "\n", 297 | " # check if the code is correct\n", 298 | " logging.info(\"Checking if the code is correct\")\n", 299 | " test_report = await check_correctness(\n", 300 | " solution.source_code,\n", 301 
| " problem.sample_input,\n", 302 | " problem.sample_output,\n", 303 | " timeout=timeout,\n", 304 | " )\n", 305 | "\n", 306 | " return {\"solution\": solution, \"test_report\": test_report}" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "# Evaluation" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "Now we are ready to evaluate against the expected solutions.\n", 321 | "\n", 322 | "### Create a Weave Model\n", 323 | "First we create a Weave [\"Model\"](https://weave-docs.wandb.ai/guides/core-types/models?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup), which has a `predict` function that Weave Evaluations will call to generate a solution. It also has various attributes that we can set to adjust the behaviour of our pipeline." 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "class OneShotSolver(weave.Model):\n", 333 | " code_execution_timeout: int = 30\n", 334 | " llm_model: str = STRONG_LLM\n", 335 | " system_prompt: str = system_prompt\n", 336 | " prompt_template: str = prompt_template\n", 337 | " temperature: float = 0.7\n", 338 | "\n", 339 | " @weave.op\n", 340 | " async def predict(self, problem: dict):\n", 341 | " return await one_shot_solver(\n", 342 | " problem=Problem(**problem), \n", 343 | " llm_model=self.llm_model,\n", 344 | " system_prompt=self.system_prompt, \n", 345 | " prompt_template=self.prompt_template, \n", 346 | " timeout=self.code_execution_timeout,\n", 347 | " temperature=self.temperature\n", 348 | " )" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Create the Evals Dataset and a Scorer" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "We expect the output of the \"test_report\" from our agent above 
to be `\"passed\"` if the solution is correct. You can think of `expected_result` in the `evals_dataset` as the label that the `test_report` from our solver needs to return in order to ensure the generated solution is correct. In this case the scoring is actually happening in our agentic pipeline as the agent needs to know the result so it can decide whether or not to retry.\n", 363 | "\n", 364 | "Weave Evaluations expects data formatted as a list of dictionaries for the evaluation dataset. We dump `problem` as a dictionary." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "evals_dataset = [{\"problem\": problem.model_dump(), \"expected_result\": \"passed\"} for problem in problems]" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "Weave Evaluations use a scorer function that returns a metric and its result in a dict. Here we define a metric that checks if the code generated by agent passed the test case" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "@weave.op\n", 390 | "def scorer(expected_result: str, model_output: dict) -> dict:\n", 391 | " if model_output is None or model_output[\"test_report\"].status is None:\n", 392 | " return {\"solution_passed\": False}\n", 393 | " return {\"solution_passed\": expected_result == model_output[\"test_report\"].status} # check if the test_report status == passed" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "model = OneShotSolver()\n", 403 | "\n", 404 | "evaluator = weave.Evaluation(dataset=evals_dataset, scorers=[scorer], trials=1)\n", 405 | "\n", 406 | "results = await evaluator.evaluate(model)" 407 | ] 408 | } 409 | ], 410 | "metadata": { 411 | "accelerator": 
"GPU", 412 | "colab": { 413 | "include_colab_link": true, 414 | "provenance": [], 415 | "toc_visible": true 416 | }, 417 | "kernelspec": { 418 | "display_name": "Python 3", 419 | "name": "python3" 420 | } 421 | }, 422 | "nbformat": 4, 423 | "nbformat_minor": 2 424 | } 425 | -------------------------------------------------------------------------------- /rag_code_agent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open\n", 8 | "" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "\"Weights\n", 16 | "\n", 17 | "# W&B Lighting Competition - AI Hacker Cup \n", 18 | "\n", 19 | "\n", 20 | "\n", 21 | "[Weights & Biases](https://wandb.ai/site?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) are running a 7-day Lightning Competition focussed on solving practice problems for the [2024 NeurIPS AI Hacker Cup](https://hackercupai.github.io/) challenge.\n", 22 | "\n", 23 | "#### Goal\n", 24 | "The goal is to try and solve all 5 of the 2023 practice questions for the AI Hacker Cup using MistralAI's models. We’re offering free MistralAI api access via the code in this colab to get people started.\n", 25 | "\n", 26 | "#### Competition GitHub\n", 27 | "The competition [repo here](https://github.com/wandb/aihackercup) contains this colab, the code for the Code Generation Agent and the details on how to make a submission and the competition rules. 
Note that to run this notebook you'll need to be running it with a T4 GPU (15GB) or larger as the embedding model is run locally.\n", 28 | "\n", 29 | "#### Discord\n", 30 | "You can join the official NeurIPS AI Hacker Cup [discord here](discord.gg/wWeN9hTH32) to share ideas and discuss winning solutions.\n", 31 | "\n", 32 | "## Prizes\n", 33 | "\n", 34 | "Weights & Biases are giving away a pair of Meta Ray-Ban Smart Glasses for the first individual to submit code that solves:\n", 35 | "- 3 out of 5 correct solutions\n", 36 | "- 4 out of 5 correct solutions\n", 37 | "- 5 out of 5 correct solutions\n", 38 | "\n", 39 | "(i.e. in total 3 pairs of sunglasses to give away)\n", 40 | "\n", 41 | "## Entry Submissions, Rules & Deadline\n", 42 | "\n", 43 | "See the [competition README](https://github.com/wandb/aihackercup) for how to make a submissions the the competition rules." 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## W&B Weave\n", 51 | "\n", 52 | "[W&B Weave](https://weave-docs.wandb.ai/tutorial-eval?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) is used in this competition to run the evaluations. It is a lightweight toolkit for tracking and evaluating LLM applications, built by Weights & Biases. \n", 53 | "\n", 54 | "\n", 55 | "\n", 56 | "If you want to learn more about Weave, you can [get started](https://weave-docs.wandb.ai/quickstart?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) by decorating Python functions with `@weave.op`." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# Using RAG for a Code Generation Agent\n", 64 | "\n", 65 | "This colab demonstrates how to retrieve over a dataset of coding question-answer pairs (the [CodeContests](https://huggingface.co/datasets/deepmind/code_contests) dataset from DeepMind) in order to find simlar questions that might help our Agent generate the correct solution.\n", 66 | "\n", 67 | "A more detailed walkthough of the approach we will use in this notebook can be found in the following **[Youtube video](https://www.youtube.com/watch?v=cObBj2UpWK8)**:\n", 68 | "\n", 69 | "\n", 70 | "\n", 71 | "" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Setup " 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "**Note: You need to run this cell only once**\n", 86 | "We will clone the starter-kits repo\n", 87 | "Set the rag folder as our working directory\n", 88 | "and install the dependencies for the project.\n", 89 | "\n", 90 | "**You can comment out the cell after you have run it once.**" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Clone the starter-kits repo\n", 100 | "!git clone https://github.com/wandb/aihackercup\n", 101 | "# Change directory to the rag folder. Running the next line twice in the same session will raise an error.\n", 102 | "%cd aihackercup\n", 103 | "# Install dependencies\n", 104 | "!pip install -r requirements.txt -qq" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "To run this colab, create a [free Weights & Biases (W&B) account here](https://wandb.ai/site?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup) and then copy your API key from https://wandb.ai/authorize into the input box below when requested." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "import os\n", 121 | "import weave\n", 122 | "\n", 123 | "WEAVE_PROJECT = \"ai-hacker-cup\"\n", 124 | "weave_client = weave.init(WEAVE_PROJECT)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Select MistralAI models used depending if you want a fast or strong LLM\n", 134 | "# You can see the full range of MistralAI models here: https://docs.mistral.ai/getting-started/models/\n", 135 | "FAST_LLM = \"open-mistral-nemo-2407\"\n", 136 | "STRONG_LLM = \"mistral-large-latest\"\n", 137 | "\n", 138 | "os.environ[\"FAST_LLM\"] = STRONG_LLM # We'll use stong model everywhere\n", 139 | "os.environ[\"STRONG_LLM\"] = STRONG_LLM\n", 140 | "\n", 141 | "# URL for the MistralAI api we'll be using\n", 142 | "os.environ[\"BASE_URL\"] = \"http://195.242.25.198:8000/v1\"\n", 143 | "\n", 144 | "# Set the max tokens for the models and how many parallel requests to make in Weave Evaluations\n", 145 | "os.environ[\"MAX_TOKENS\"] = \"4096\"\n", 146 | "os.environ[\"WEAVE_PARALLELISM\"] = \"2\"" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Challenges Dataset\n", 154 | "We will use the **practice** dataset from the **2023** [HackerCup dataset](https://huggingface.co/datasets/hackercupai/hackercup).\n", 155 | "\n", 156 | "We have already processed the dataset and saved it as a [`weave.Dataset`](https://weave-docs.wandb.ai/guides/core-types/datasets/?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup). 
You can either use the Dataset by running the next cell or download the dataset using the instructions below.\n", 157 | "\n", 158 | "We will use this challenge dataset to load some practice problems and solutions from the HackerCup dataset and evaluate our agents on it." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "from agent import rag_solver, rework_solution\n", 168 | "from utils import Problem\n", 169 | "\n", 170 | "practice_dataset_uri = \"weave:///parambharat/hackercup/object/practice_dataset:R35fXf9N3FE2IOesg7bRPaPAxiE9YbpirhXO9HcHs8w\"\n", 171 | "problems_dataset = weave.ref(practice_dataset_uri).get().rows[:]\n", 172 | "problems = list(map(lambda x: Problem(**x), problems_dataset))\n", 173 | "problem = problems[2] # Select the first problem\n", 174 | "\n", 175 | "print(\"Sample Problem:\\n\\n\", problem.model_dump_json(indent=2))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "#### [Alternative] Download the raw challenges dataset\n", 183 | "\n", 184 | "You can alternatively download the full raw challenges dataset, see the README to see how." 
185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "#### Turn on logging and asyncio for notebooks" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "import asyncio\n", 201 | "import logging\n", 202 | "from nest_asyncio import apply\n", 203 | "\n", 204 | "apply()\n", 205 | "logging.basicConfig(\n", 206 | " format=\"%(asctime)s : %(levelname)s : %(message)s\", level=logging.INFO\n", 207 | ")\n", 208 | "logger = logging.getLogger(__name__)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## Running a RAG + Reflection Agent" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "### RAG Agent with Reflection\n", 223 | "\n", 224 | "We will combine a RAG Agent with Reflection in order to:\n", 225 | "\n", 226 | "- Retrieve similar types of questions from the CodeContests dataset, generate a solution, reflect on the solution and test results and improve it.\n", 227 | "- We then use this improved solution to generate new few-shot examples and repeat the process in a loop until we converge to a solution or the iteration limit is reached.\n", 228 | "\n", 229 | "`agent.py` contains the prompts used for analysis (`ANALYSIS_INSTRUCTIONS`), reflection (`REFLECTION_INSTRUCTIONS`) and problem solving (`SOLVER_INSTRUCTIONS`) feel free to edit them to improve the system." 
230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "from agent import REFLECTION_INSTRUCTIONS\n", 239 | "\n", 240 | "print(REFLECTION_INSTRUCTIONS)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### Retriever\n", 248 | "\n", 249 | "The code used the retrieval over the CodeContests dataset can be found in `retriever.py`. You'll see we're using the `jinaai/jina-embeddings-v2-base-code` embedding model locally as it has been trained on code. \n", 250 | "\n", 251 | "Here we'll initialise our retriever." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "from retriever import Retriever\n", 261 | "\n", 262 | "retriever = Retriever()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "### RAG Solver Pipeline\n", 270 | "\n", 271 | "Here we run the code generation pipeline which:\n", 272 | "- given a problem, retrieves similar problems from the CodeCompletions dataset\n", 273 | "- generates candidate code for problem\n", 274 | "- executes the code\n", 275 | "- checks if the executed code generates the correct solution\n", 276 | "- if the solution is correct, it terminates otherwise it retries for `max_iterations`\n", 277 | "\n", 278 | "Note `code_execution_timeout`is used to limit the time available for the generated python code to execute as sometimes the code generated be recursive code that never terminates." 
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "@weave.op\n", 288 | "async def rag_solver_with_reflection(\n", 289 | " retriever: Retriever,\n", 290 | " problem: Problem,\n", 291 | " model: str = FAST_LLM,\n", 292 | " temperature: float = 0.7,\n", 293 | " max_iterations: int = 2,\n", 294 | " code_execution_timeout: int = 10,\n", 295 | "):\n", 296 | " num_iterations = 0\n", 297 | " while num_iterations < max_iterations:\n", 298 | " rag_result = await rag_solver(\n", 299 | " retriever=retriever,\n", 300 | " problem=problem,\n", 301 | " timeout=code_execution_timeout,\n", 302 | " model=model,\n", 303 | " temperature=temperature,\n", 304 | " )\n", 305 | " solution, test_report = rag_result[\"solution\"], rag_result[\"test_report\"]\n", 306 | " if test_report.status == \"passed\":\n", 307 | " logger.info(f\"Passing solution generated successfully for problem: {problem.problem_name}\")\n", 308 | " return rag_result\n", 309 | " \n", 310 | " logger.info(f\"Solution failed, reworking solution. Problem: {problem.problem_name}\")\n", 311 | " rework_result = await rework_solution(\n", 312 | " problem=problem,\n", 313 | " incorrect_solution=solution,\n", 314 | " test_report=test_report,\n", 315 | " model=model,\n", 316 | " temperature=temperature,\n", 317 | " timeout=code_execution_timeout,\n", 318 | " )\n", 319 | " solution, test_report = rework_result[\"solution\"], rework_result[\"test_report\"]\n", 320 | " if test_report.status == \"passed\":\n", 321 | " logger.info(f\"Re-worked solution passed for problem: {problem.problem_name}\")\n", 322 | " return {\n", 323 | " \"solution\": solution,\n", 324 | " \"stage\": \"reflection\",\n", 325 | " \"test_report\": test_report,\n", 326 | " }\n", 327 | " num_iterations += 1\n", 328 | " logger.info(f\"Re-worked solution failed, trying iteration {num_iterations}. 
Problem: {problem.problem_name}\")\n", 329 | " logger.info(\"Failed to generate a solution after {num_iterations} iterations. Problem: {problem.problem_name}\")\n", 330 | " return {\"solution\": solution, \"stage\": \"failed\", \"test_report\": test_report}" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "Lets run the pipeline on 1 problem, **this will take about 7 minutes to complete** as it makes a lot of LLM calls and runs multiple iterations." 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "reflection_result = await rag_solver_with_reflection(\n", 347 | " retriever, problem, STRONG_LLM, max_iterations=2, code_execution_timeout=30\n", 348 | ")\n", 349 | "\n", 350 | "print(\"*\" * 40 + \" SOLUTION: \" + \"*\" * 40)\n", 351 | "print(reflection_result[\"solution\"].source_code)\n", 352 | "print(\"*\" * 40 + \" TEST REPORT \" + \"*\" * 40)\n", 353 | "print(reflection_result[\"test_report\"])" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "# Evaluation" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Now we are ready to evaluate against the expected solutions.\n", 368 | "\n", 369 | "### Create a Weave Model\n", 370 | "First we create a Weave [\"Model\"](https://weave-docs.wandb.ai/guides/core-types/models?utm_source=colab&utm_medium=code&utm_campaign=lightning-ai-hacker-cup), which has a `predict` function that Weave Evaluations will call to generate a solution. It also has various attributes that we can set to adjust the behaviour of our pipeline." 
371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "class RAGReflectionAgent(weave.Model):\n", 380 | " retriever: Retriever\n", 381 | " max_iterations: int = 2\n", 382 | " code_execution_timeout: int = 30\n", 383 | " model: str = STRONG_LLM\n", 384 | " temperature: float = 0.7\n", 385 | "\n", 386 | " @weave.op\n", 387 | " async def predict(self, problem: dict):\n", 388 | " return await rag_solver_with_reflection(\n", 389 | " self.retriever,\n", 390 | " Problem(**problem),\n", 391 | " model=self.model,\n", 392 | " temperature=self.temperature,\n", 393 | " max_iterations=self.max_iterations,\n", 394 | " code_execution_timeout=self.code_execution_timeout,\n", 395 | " )" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "### Create the Evals Dataset and a Scorer" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "We expect the output of the \"test_report\" from our agent above to be `\"passed\"` if the solution is correct. You can think of `expected_result` in the `evals_dataset` as the label that the `test_report` from our solver needs to return in order to ensure the generated solution is correct. In this case the scoring is actually happening in our agentic pipeline as the agent needs to know the result so it can decide whether or not to retry.\n", 410 | "\n", 411 | "Weave Evaluations expects data formatted as a list of dictionaries for the evaluation dataset. We dump `problem` as a dictionary." 
412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "evals_dataset = [{\"problem\": problem.model_dump(), \"expected_result\": \"passed\"} for problem in problems]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "Weave Evaluations use a scorer function that returns a metric and its result in a dict. Here we define a metric that checks if the code generated by agent passed the test case" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "@weave.op\n", 437 | "def scorer(expected_result: str, model_output: dict) -> dict:\n", 438 | " if model_output is None or model_output[\"test_report\"].status is None:\n", 439 | " return {\"solution_passed\": False}\n", 440 | " return {\"solution_passed\": expected_result == model_output[\"test_report\"].status} # check if the test_report status == passed" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "### Run the Evaluation\n", 448 | "Now we instantiate the Agent and run the evaluation. Results from the evaluation will be printed in the W&B Weave UI. The WEAVE_PARALLELISM env var determines how many evaluations are run in parallel and is set at 2 by default, each can take 7 to 9 minutes." 
449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# Evaluate the RAG reflection agent\n", 458 | "tasks = []\n", 459 | "\n", 460 | "LLM = STRONG_LLM\n", 461 | "eval_temperature = 0.7\n", 462 | "\n", 463 | "# Instantiate the agent, which is a subclass of `weave.Model`\n", 464 | "rag_reflection_agent = RAGReflectionAgent(\n", 465 | " retriever=retriever, model=LLM, temperature=eval_temperature, code_execution_timeout=30\n", 466 | ")\n", 467 | "\n", 468 | "# Weave Evaluations take a dataset and scoring functions.\n", 469 | "# This evaluation checks if the code generated by the agent passes\n", 470 | "# trials can be set to run the full evaluation multiple times\n", 471 | "evaluator = weave.Evaluation(dataset=evals_dataset, scorers=[scorer], trials=1)\n", 472 | "\n", 473 | "# Evaluate the agent by passing it to the evaluator\n", 474 | "# Weave Evaluations are async, so we use `asyncio.gather` to run them in parallel\n", 475 | "# The WEAVE_PARALLELISM environment variable sets the number of evaluations to run in parallel\n", 476 | "rag_reflection_results = evaluator.evaluate(rag_reflection_agent)\n", 477 | "tasks.append(rag_reflection_results)\n", 478 | "rag_reflection_results = await asyncio.gather(*tasks)\n", 479 | "\n", 480 | "logger.info(rag_reflection_results)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "You will now be able to find your evaluation results in the Weights & Biases UI in the Evaluations tab. 
You can find a link to your Weave project under the cell above that calls `weave.init`" 488 | ] 489 | } 490 | ], 491 | "metadata": { 492 | "accelerator": "GPU", 493 | "colab": { 494 | "include_colab_link": true, 495 | "provenance": [], 496 | "toc_visible": true 497 | }, 498 | "kernelspec": { 499 | "display_name": "Python 3", 500 | "name": "python3" 501 | } 502 | }, 503 | "nbformat": 4, 504 | "nbformat_minor": 2 505 | } 506 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bm25s==0.1.10 2 | datasets==2.21.0 3 | joblib==1.4.2 4 | pandas==2.2.2 5 | pydantic==2.8.2 6 | scikit_learn==1.5.1 7 | simple_parsing==0.1.5 8 | tree_sitter_languages==1.10.2 9 | tree-sitter==0.21.3 10 | weave==0.51.2 11 | sentence-transformers==3.0.1 12 | openai==1.43.1 13 | instructor==1.4.0 -------------------------------------------------------------------------------- /retriever.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import logging 3 | import os 4 | from pathlib import Path 5 | from typing import List, Optional 6 | 7 | import bm25s 8 | import pandas as pd 9 | import weave 10 | from datasets import load_dataset 11 | from joblib import Parallel, delayed 12 | from sentence_transformers import SentenceTransformer 13 | from sentence_transformers.util import cos_sim 14 | from simple_parsing import ArgumentParser 15 | 16 | from utils import Problem, Solution, clean_code_string, remove_extra_newlines 17 | 18 | logging.basicConfig( 19 | format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO 20 | ) 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | # Data Loading 25 | 26 | LANGUAGE_MAP = { 27 | 3: "Python3", 28 | } 29 | 30 | 31 | def clean_code(row: dict) -> dict: 32 | outputs = [] 33 | for item in row["code"]: 34 | item = clean_code_string(item) 35 | outputs.append(item) 36 | 
return {"code": outputs} 37 | 38 | 39 | def get_solution(row: dict) -> dict: 40 | solutions = row["solutions"] 41 | languages = solutions["language"] 42 | solutions = solutions["solution"] 43 | 44 | outputs = [] 45 | for language, solution in zip(languages, solutions): 46 | language = LANGUAGE_MAP.get(language) 47 | if language: 48 | outputs.append(solution) 49 | return {"code": outputs} 50 | 51 | 52 | def get_test_cases(row: dict) -> dict: 53 | tests = row["public_tests"] 54 | return { 55 | "sample_inputs": "".join(tests["input"]), 56 | "sample_outputs": "".join(tests["output"]), 57 | } 58 | 59 | 60 | def clean_description(row: dict) -> dict: 61 | description = row["description"] 62 | description = remove_extra_newlines(description) 63 | return {"description": description} 64 | 65 | 66 | def get_code_contests_data(cache_file: Path, reload_cache: bool = False): 67 | if cache_file.exists() and not reload_cache: 68 | logger.info(f"Loading cached raw data from {cache_file}") 69 | return pd.read_json(cache_file, lines=True) 70 | 71 | logger.info(f"Loading raw data from dataset") 72 | ds = load_dataset("deepmind/code_contests") 73 | 74 | train_ds = ds["train"].map(get_solution, num_proc=4) 75 | train_ds = train_ds.filter(lambda x: not x["is_description_translated"], num_proc=4) 76 | train_ds = train_ds.filter(lambda x: len(x["code"]) > 0, num_proc=4) 77 | train_ds = train_ds.map(clean_code, num_proc=4) 78 | train_ds = train_ds.map(clean_description, num_proc=4) 79 | train_ds = train_ds.map(get_test_cases, num_proc=4) 80 | train_ds = train_ds.remove_columns( 81 | [ 82 | col 83 | for col in train_ds.column_names 84 | if col not in ["description", "code", "sample_inputs", "sample_outputs"] 85 | ] 86 | ) 87 | 88 | train_df = train_ds.to_pandas() 89 | train_df = train_df.explode("code").reset_index(drop=True) 90 | train_df = train_df.drop_duplicates(subset=["code"], keep="first") 91 | train_df.to_json(cache_file, orient="records", lines=True) 92 | return train_df 93 | 94 | 
# Data Preprocessing

# Map AST node types to coarse token names used to normalize code for BM25.
TOKEN_MAP = {
    ast.FunctionDef: "FUNC_DEF",
    ast.ClassDef: "CLASS_DEF",
    ast.BinOp: "BIN_OP",
    ast.Assign: "ASSIGN",
    ast.Expr: "EXPR",
    ast.Call: "FUNC_CALL",
    ast.If: "IF",
    ast.For: "FOR",
    ast.While: "WHILE",
    ast.Import: "IMPORT",
    ast.Return: "RETURN",
    ast.List: "LIST",
    ast.Dict: "DICT",
    ast.Name: "VAR",
    # BUGFIX: check bool BEFORE the numeric types. bool is a subclass of int,
    # so the original order classified True/False as NUMBER and made the
    # BOOLEAN branch unreachable.
    ast.Constant: lambda node: (
        "BOOLEAN"
        if isinstance(node.value, bool)
        else "NUMBER"
        if isinstance(node.value, (int, float, complex))
        else "STRING"
        if isinstance(node.value, str)
        else "NONE"
        if node.value is None
        else "UNKNOWN"
    ),
}

# ast.Num only appears in parse trees on Python < 3.8; the alias is deprecated
# and may be absent from newer stdlibs, so register it defensively.
if hasattr(ast, "Num"):
    TOKEN_MAP[ast.Num] = "NUMBER"


def tokenize_node(node):
    """Yield coarse tokens for `node` and all of its descendants (pre-order)."""
    node_type = type(node)

    if node_type in TOKEN_MAP:
        token = TOKEN_MAP[node_type]
        # ast.Constant maps to a function because its token depends on the value.
        if callable(token):
            yield token(node)
        else:
            yield token

    for child in ast.iter_child_nodes(node):
        yield from tokenize_node(child)


def normalize_code(code: str) -> Optional[str]:
    """Tokenize a Python snippet into a space-joined token string.

    Returns None when the snippet does not parse.
    """
    try:
        tree = ast.parse(code)
    except SyntaxError:
        return None

    return " ".join(tokenize_node(tree))


@weave.op
def normalize_code_list(code_list: list[str]) -> list[str]:
    """Normalize many snippets, parallelizing only for large batches."""
    if len(code_list) > 1000:
        return Parallel(n_jobs=-1)(delayed(normalize_code)(code) for code in code_list)
    else:
        return [normalize_code(code) for code in code_list]


def preprocess_data(
    input_path: Path, output_path: Path, reload_cache: bool = False
) -> pd.DataFrame:
    """Add a `normalized_code` column to the raw JSONL dump, with caching.

    Rows whose code fails to parse (normalized_code is None) are dropped.
    """
    if output_path.exists() and not reload_cache:
        logger.info(f"Loading cached preprocessed data from {output_path}")
        return pd.read_json(output_path, lines=True)

    logger.info(f"Preprocessing data from {input_path}")
    data_df = pd.read_json(input_path, lines=True)
    data_df["normalized_code"] = normalize_code_list(data_df["code"].tolist())
    data_df = data_df.dropna(subset=["normalized_code"])
    data_df.to_json(output_path, orient="records", lines=True)
    return data_df


class Retriever:
    """BM25 retriever over normalized code documents.

    BUGFIX: callers (`index_data` and the `__main__` block) construct this
    class as `Retriever(data_df=...)` and call `Retriever.load(...)` /
    `retriever.save(...)`, none of which existed — every one of those paths
    raised TypeError/AttributeError. They are added here; the original
    no-argument path (load the HF dataset) is unchanged.
    """

    def __init__(self, path: str = "param-bharat/rag-hackercup", data_df=None):
        if data_df is None:
            ds = load_dataset(path, split="train")
            data_df = ds.to_pandas()
        self.docs = data_df.to_dict(orient="records")
        self.corpus = data_df["normalized_code"]
        self.retriever = self.index()

    def index(self):
        """Build and return a BM25 index over the normalized corpus."""
        corpus = self.corpus.tolist()
        corpus_tokens = bm25s.tokenize(corpus, stopwords=None)
        retriever = bm25s.BM25(corpus=corpus)
        retriever.index(corpus_tokens)
        return retriever

    def save(self, path) -> None:
        """Persist the documents as JSONL; the BM25 index is rebuilt on load()."""
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        pd.DataFrame(self.docs).to_json(
            path / "docs.jsonl", orient="records", lines=True
        )

    @classmethod
    def load(cls, path) -> "Retriever":
        """Restore a retriever saved with save(); re-indexes the corpus."""
        data_df = pd.read_json(Path(path) / "docs.jsonl", lines=True)
        return cls(data_df=data_df)

    @weave.op
    def retrieve(self, query: str, k: int = 10):
        """Return the top-k documents for a code query.

        The query is cleaned and normalized the same way the corpus was, so
        lexical BM25 matching happens in token space, not raw source.
        """
        clean_query = clean_code_string(query)
        normalized_query = normalize_code(clean_query)
        query_tokens = bm25s.tokenize(normalized_query, stopwords=None)
        docs, _ = self.retriever.retrieve(query_tokens, k=k, corpus=self.docs)
        return docs[0, :].tolist()


def index_data(
    input_path: Path,
    output_path: Path,
    reload_cache: bool = False,
):
    """Create (or load from cache) a Retriever from a preprocessed JSONL file."""
    if output_path.exists() and not reload_cache:
        logger.info(f"Loading cached retriever from {output_path}")
        return Retriever.load(output_path)
    logger.info(f"Creating retriever from {input_path}")
    data_df = pd.read_json(input_path, lines=True, orient="records")
    retriever = Retriever(data_df=data_df)
    # NOTE: __init__ already builds the index; the original called
    # retriever.index() again here and discarded the result.
    retriever.save(output_path)
    return retriever


class RerankModel:
    """Embedding-based reranker using a code-specialized embedding model."""

    def __init__(self):
        self.model = SentenceTransformer(
            "jinaai/jina-embeddings-v2-base-code", trust_remote_code=True
        )

        # control your input sequence length up to 8192
        self.model.max_seq_length = 1024

    @weave.op
    def __call__(
        self,
        problem: Problem,
        solution: Solution,
        retrieved_docs: List[dict],
        top_k: int = 3,
    ):
        """Rerank retrieved docs by cosine similarity to the problem+solution.

        Duplicated descriptions are collapsed (keeping the most similar) and
        the top_k documents are returned as records.
        """
        query_text = problem.problem_description + " " + solution.source_code
        context_text = [
            doc["description"] + " " + doc["code"] for doc in retrieved_docs
        ]

        query_embeddings = self.model.encode([query_text])
        context_embeddings = self.model.encode(context_text, batch_size=2)
        similarities = cos_sim(query_embeddings, context_embeddings)
        docs_df = pd.DataFrame(retrieved_docs)
        docs_df["similarity"] = similarities[0]
        docs_df = docs_df.sort_values(by="similarity", ascending=False)
        docs_df = docs_df.drop_duplicates(
            subset=["description"],
            keep="first",
        )
        return docs_df.head(top_k).to_dict(orient="records")


# NOTE(review): instantiated at import time, which downloads/loads the model
# as a side effect of importing this module — consider lazy construction.
rerank_model = RerankModel()


@weave.op
async def rerank_docs(
    problem: Problem,
    solution: Solution,
    retrieved_docs: List[dict],
    top_k: int = 3,
) -> List[dict]:
    """Async wrapper around the module-level RerankModel instance."""
    return rerank_model(problem, solution, retrieved_docs, top_k)


if __name__ == "__main__":

    parser = ArgumentParser()
    parser.add_argument("-c", "--cache-directory", type=Path, default="data/cache")
    parser.add_argument("--reload-cache", action="store_true")

    args = parser.parse_args()

    if not args.cache_directory.exists():
        args.cache_directory.mkdir(parents=True)

    # Three cache levels: saved retriever > preprocessed JSONL > raw download.
    if (args.cache_directory / "retriever").exists():
        retriever = Retriever.load(args.cache_directory / "retriever")
    elif (args.cache_directory / "preprocessed.jsonl").exists():
        preprocessed_df = preprocess_data(
            args.cache_directory / "raw.jsonl",
            args.cache_directory / "preprocessed.jsonl",
            args.reload_cache,
        )
        retriever = Retriever(data_df=preprocessed_df)
        retriever.save(args.cache_directory / "retriever")
    else:
        raw_df = get_code_contests_data(
            args.cache_directory / "raw.jsonl", args.reload_cache
        )
        preprocessed_df = preprocess_data(
            args.cache_directory / "raw.jsonl",
            args.cache_directory / "preprocessed.jsonl",
            args.reload_cache,
        )
        retriever = Retriever(data_df=preprocessed_df)
        retriever.save(args.cache_directory / "retriever")


# ---------------------------------------------------------------------------
# /utils.py  (next file in this dump)
# ---------------------------------------------------------------------------
import asyncio
import multiprocessing
import os
import pathlib
from pathlib import Path
import queue
import re
import subprocess
import sys
import logging
import time
import traceback
from typing import Any, List
import math

import weave
import openai
import instructor

from pydantic import BaseModel, Field
from tree_sitter_languages import get_language, get_parser


# API params
BASE_URL = os.getenv("BASE_URL", None)
API_KEY = os.getenv("API_KEY", "dummy_key")

# params
MAX_TOKENS = int(os.getenv("MAX_TOKENS", 4096))
FAST_LLM = os.getenv("FAST_LLM", "open-mistral-nemo-2407")
STRONG_LLM = os.getenv("STRONG_LLM", "mistral-large-latest")

# API client
oai_client = openai.AsyncOpenAI(base_url=BASE_URL, api_key=API_KEY)
async_client = instructor.from_openai(oai_client, mode=instructor.Mode.JSON)
language = get_language("python")
tree_parser = get_parser("python")

# (a duplicate `import re` originally sat here; removed — `re` is already
# imported at the top of this module)


def maybe_remove_backticks(solution: str) -> str:
    """Strip a surrounding Markdown code fence from an LLM-produced solution.

    Generalized: the original pattern only removed a ```python opener, so
    fences such as a bare ``` or ```py survived. Inputs without a fence are
    returned stripped but otherwise unchanged.
    """
    solution = solution.strip()
    solution = re.sub(r'^```[A-Za-z0-9_+-]*\s*', '', solution)
    solution = re.sub(r'\s*```$', '', solution)
    return solution


def remove_extra_newlines(text: str) -> str:
    """Replace 2+ newlines (with possible whitespace in between) with one."""
    text = re.sub(r"\n\s*\n+", "\n", text)
    return text


def remove_comments_and_docstrings(code):
    """Strip comments and module/class/function docstrings from Python source.

    Uses tree-sitter queries rather than the tokenizer so that syntactically
    odd (but parseable) code still round-trips.
    """
    # Define queries to capture comments and docstrings
    doc_str_pattern = """
    (module . (expression_statement (string)) @module_doc_str)
    (class_definition body: (block . (expression_statement (string)) @class_doc_str))
    (function_definition body: (block . (expression_statement (string)) @function_doc_str))
    """

    comment_pattern = "(comment) @comment"
    # Parse the code
    tree = tree_parser.parse(code.encode())
    root_node = tree.root_node

    # Query the tree for docstrings and comments
    doc_str_query = language.query(doc_str_pattern)
    doc_strs = doc_str_query.captures(root_node)

    comment_query = language.query(comment_pattern)
    comments = comment_query.captures(root_node)

    # Get the start and end points of all docstrings and comments
    doc_str_points = set((node.start_byte, node.end_byte) for node, _ in doc_strs)
    comment_points = set((node.start_byte, node.end_byte) for node, _ in comments)

    # Create a set of all points to remove
    remove_points = doc_str_points.union(comment_points)

    # Reconstruct the code, skipping over the parts to remove
    cleaned_code = []
    last_index = 0
    for start, end in sorted(remove_points):
        if last_index < start:
            cleaned_code.append(code[last_index:start])
        last_index = end

    # Add any remaining code after the last comment/docstring
    cleaned_code.append(code[last_index:])

    return "".join(cleaned_code)


def clean_code_string(code: str) -> str:
    """Remove comments/docstrings and collapse blank lines."""
    code = remove_comments_and_docstrings(code)
    code = remove_extra_newlines(code)
    return code


class TestReport(BaseModel):
    # Outcome of running a candidate program: status is one of
    # passed / failed / timeout / error; message carries the detail.
    status: str
    message: str

    @property
    def as_xml(self) -> str:
        # NOTE(review): the XML tags appear to have been stripped from this
        # template by the dump — confirm against the repository.
        return f"""

{self.status}
{self.message}

"""


def compare_lines_with_tolerance(
    expected: str, actual: str, tolerance: float = 1e-9
) -> bool:
    """
    Compare two lines of output with a tolerance for floating point numbers.

    Both outputs must consist of "Case #i: ..." lines; numeric fields are
    compared with math.isclose, all other fields with string equality.
    """
    expected_lines = expected.strip().split('\n')
    actual_lines = actual.strip().split('\n')

    if len(expected_lines) != len(actual_lines):
        return False

    for expected_line, actual_line in zip(expected_lines, actual_lines):
        expected_match = re.match(r"Case #\d+: (.+)", expected_line)
        actual_match = re.match(r"Case #\d+: (.+)", actual_line)

        if not expected_match or not actual_match:
            return False

        expected_values = expected_match.group(1).split()
        actual_values = actual_match.group(1).split()

        if len(expected_values) != len(actual_values):
            return False

        for expected_value, actual_value in zip(expected_values, actual_values):
            try:
                expected_float = float(expected_value)
                actual_float = float(actual_value)
                # BUGFIX: with rel_tol alone, an expected value of 0.0 never
                # matches a tiny nonzero actual (relative tolerance of zero is
                # zero); abs_tol covers near-zero answers.
                if not math.isclose(
                    expected_float, actual_float, rel_tol=tolerance, abs_tol=tolerance
                ):
                    return False
            except ValueError:
                if expected_value != actual_value:
                    return False

    return True


async def exec_program(program, input_data, expected_output, timeout):
    """Run `program` under the current interpreter and grade its stdout.

    Returns a TestReport whose status is passed/failed/timeout/error.
    """
    try:
        process = await asyncio.create_subprocess_exec(
            sys.executable, "-c", program,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        try:
            stdout, stderr = await asyncio.wait_for(
                process.communicate(input=input_data.encode()), timeout=timeout
            )
        except asyncio.TimeoutError:
            process.kill()
            return TestReport(
                status="timeout",
                message=f"Took too long! Your program timed out after {timeout} seconds of execution."
            )

        if process.returncode != 0:
            return TestReport(
                status="error", message=f"Program execution failed: {stderr.decode()}"
            )
        else:
            # Tolerant comparison: allows small floating point drift per value.
            if compare_lines_with_tolerance(expected_output, stdout.decode()):
                return TestReport(
                    status="passed", message="Yay! Your program ran successfully"
                )
            else:
                # NOTE(review): markup around the expected/actual sections
                # looks stripped in this dump — confirm the original template.
                return TestReport(
                    status="failed",
                    message=f"\n{expected_output}\n---\n\n{stdout.decode()}",
                )
    except Exception:
        return TestReport(
            status="error", message=f"An error occurred: {traceback.format_exc()}"
        )


@weave.op
async def check_correctness(
    program: str, input_data: str, expected_output: str, timeout: float
) -> TestReport:
    """Traced wrapper around exec_program."""
    return await exec_program(program, input_data, expected_output, timeout)


@weave.op
async def format_response(text: str, model: Any, temperature: float = 0.1) -> Any:
    """Extract structured data from free text into `model` (a pydantic class).

    Note: `model` is the response schema, not the LLM name — the LLM used is
    always FAST_LLM.
    """
    formatted_response = await async_client.chat.completions.create(
        model=FAST_LLM,
        # Instructor adds a system message by default about how to format the
        # response given the response model.
        messages=[
            {
                "role": "user",
                "content": f"Extract the relevant information from the following document and return it in valid JSON\n\n{text}",
            }
        ],
        temperature=temperature,
        response_model=model,
        max_retries=2,
        max_tokens=MAX_TOKENS,
    )
    return formatted_response


class Problem(BaseModel):
    """A contest problem loaded from the on-disk layout.

    Expected files: <name>.md, <name>_sample_input.txt,
    <name>_sample_output.txt, <name>.in and (optionally) <name>.out.
    """

    problem_dir: pathlib.Path = Field(
        ..., description="The path to the problem directory"
    )
    problem_name: str = Field(..., description="The name of the problem")
    problem_description: str = Field(..., description="The description of the problem")
    sample_input: str = Field(..., description="The sample input of the problem")
    sample_output: str = Field(..., description="The sample output of the problem")
    problem_input: pathlib.Path = Field(..., description="The path to the input file")
    problem_output: pathlib.Path = Field(..., description="The path to the output file")

    @property
    def as_xml(self) -> str:
        # NOTE(review): the XML tags appear to have been stripped from this
        # template by the dump — confirm against the repository.
        return f"""

{remove_extra_newlines(self.problem_description)}

{self.sample_input}

{self.sample_output}

"""

    @classmethod
    def from_name(cls, problem_name: str, folder_path: Path):
        """Resolve the conventional file names for `problem_name` inside `folder_path`."""
        description_path = folder_path / f"{problem_name}.md"
        sample_input_path = folder_path / f"{problem_name}_sample_input.txt"
        sample_output_path = folder_path / f"{problem_name}_sample_output.txt"
        input_path = folder_path / f"{problem_name}.in"
        output_path = folder_path / f"{problem_name}.out"

        return cls.from_files(
            problem_name=problem_name,
            description_path=description_path,
            sample_input_path=sample_input_path,
            sample_output_path=sample_output_path,
            input_path=input_path,
            output_path=output_path,
        )

    @classmethod
    def from_files(
        cls,
        problem_name: str,
        description_path: Path,
        sample_input_path: Path,
        sample_output_path: Path,
        input_path: Path,
        output_path: Optional[Path] = None,
    ):
        """Build a Problem by reading the given files.

        When output_path is omitted, it defaults to the input path with an
        .out suffix.
        """
        return cls(
            problem_name=problem_name,
            problem_description=description_path.read_text(),
            sample_input=sample_input_path.read_text(),
            sample_output=sample_output_path.read_text(),
            problem_input=input_path,
            problem_output=output_path if output_path else input_path.with_suffix('.out'),
            problem_dir=input_path.parent,
        )


def find_problems(folder: Path) -> list[dict]:
    """
    Find all the problems in the given folder.

    A problem is identified by a *.in file anywhere under `folder`; problems
    whose companion files are missing are logged and skipped.
    """
    problems = []

    # search for all files ending in .in
    problem_names = [file.stem for file in folder.glob("**/*.in")]
    for problem_name in problem_names:
        try:
            problems.append(Problem.from_name(problem_name, folder))
        except Exception as e:
            logging.error(f"Error loading problem {problem_name}: {e}")
    logging.info(f"Found {len(problems)} problems")
    return problems


class Analysis(BaseModel):
    # Structured reasoning about a problem, produced before writing code.
    core_question: str = Field(..., description="Core question of the problem")
    problem_solving_info: List[str] = Field(
        ..., description="Problem-solving information related to the core question"
    )
    algorithm: str = Field(..., description="Algorithm to solve the problem")
    tutorial: str = Field(..., description="Tutorial on the algorithm")
    plan: str = Field(..., description="Step by step plan to solve the problem")
    pseudocode: str = Field(..., description="Pseudocode to solve the problem")

    @property
    def as_xml(self) -> str:
        # NOTE(review): the XML tags appear to have been stripped from this
        # template by the dump — confirm against the repository.
        return f"""

{self.core_question}

{self.problem_solving_info}

{self.algorithm}

{self.tutorial}

{self.plan}

{self.pseudocode}

"""
class Solution(Analysis):
    # Extends Analysis with the concrete implementation produced for the plan.
    source_code: str = Field(
        ..., description="Valid Python3 sourcecode to solve the problem."
    )

    @property
    def as_xml(self) -> str:
        # Serialized analysis followed by the source code.
        # NOTE(review): the XML tags appear to have been stripped from this
        # template by the dump — confirm against the repository.
        return f"""

{super().as_xml}

{self.source_code}

"""


class Reflection(BaseModel):
    # Post-mortem on a failed attempt, used to steer the next attempt.
    reflection: str = Field(
        ...,
        description="Reflection on the problem, your solution, and the correct answer.",
    )
    keywords: List[str] = Field(
        ...,
        description="Keywords that describe the type of your errors from most general to most specific.",
    )
    step_by_step_solution: str = Field(
        ...,
        description="Step by step solution to the problem based on your knowledge of the correct answer.",
    )
    instructions: List[str] = Field(
        ...,
        description="Detailed instructions to help you correctly solve this problem in the future.",
    )
    general_advice: List[str] = Field(
        ...,
        description="General advice to help you solve similar types of problems in the future.",
    )

    @property
    def as_xml(self) -> str:
        # NOTE(review): the XML tags appear to have been stripped from this
        # template by the dump — confirm against the repository.
        return f"""

{self.reflection}

{self.keywords}

{self.step_by_step_solution}

{self.instructions}

{self.general_advice}

"""


def format_example(example: dict) -> str:
    # Render one retrieved (description, code) record for few-shot prompting.
    formatted_doc = f"""

{example['description']}

{example['code']}

"""
    return formatted_doc


def format_examples(examples: List[dict], analyses: List[Analysis]) -> str:
    # Interleave each example's description with its analysis + code, building
    # a question/solution transcript for few-shot prompting.
    def format_question(example: dict) -> str:
        # Description-only view of a retrieved example.
        return f"""

{example['description']}

"""

    def format_solution(analysis: Analysis, example: dict) -> str:
        # Analysis serialized via as_xml, followed by the example's code.
        return f"""
{analysis.as_xml}

{example['code']}

"""

    messages = ""
    # examples and analyses are expected to be parallel lists; zip truncates
    # to the shorter one if they are not.
    for example, analysis in zip(examples, analyses):
        messages += f"\n{format_question(example)}\n{format_solution(analysis, example)}\n"
    return messages.strip()